From 9f80ccec9e074d406af6d3a4355c3371221080b0 Mon Sep 17 00:00:00 2001 From: Jessica Del <50999226+OutOfCache@users.noreply.github.com> Date: Mon, 22 Jul 2024 18:04:49 +0200 Subject: [PATCH] [AMDGPU] Add intrinsic for raw atomic buffer loads (#97707) Upstream the intrinsics `llvm.amdgcn.raw.atomic.buffer.load` and `llvm.amdgcn.raw.ptr.atomic.buffer.load`. These additional intrinsics mark atomic buffer loads as atomic to LLVM by removing the `IntrReadMem` attribute. Otherwise, LLVM could hoist these loads out of loops in cases where it considers them invariant, which can cause issues such as infinite loops. Continuation of https://reviews.llvm.org/D138786, adding the use in the buffer fat pointer lowering, more test cases, and the ptr versions of these intrinsics. --------- Co-authored-by: rtayl <> Co-authored-by: Jay Foad Co-authored-by: Mariusz Sikora --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 26 ++ .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 2 + .../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 5 +- .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 10 + .../llvm.amdgcn.raw.atomic.buffer.load.ll | 304 ++++++++++++++++++ .../llvm.amdgcn.raw.ptr.atomic.buffer.load.ll | 304 ++++++++++++++++++ .../lower-buffer-fat-pointers-memops.ll | 6 +- 8 files changed, 654 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index ca85ff30f683f3..ab2620fdcf6b33 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1138,6 +1138,19 @@ class AMDGPURawBufferLoad : DefaultAttrsIntrinsi def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad; def int_amdgcn_raw_buffer_load : AMDGPURawBufferLoad; +class AMDGPURawAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic < + [data_ty], + [llvm_v4i32_ty, // rsrc(SGPR) + llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) + llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) + [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<0>; +def int_amdgcn_raw_atomic_buffer_load : AMDGPURawAtomicBufferLoad; + class AMDGPURawPtrBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < [data_ty], [AMDGPUBufferRsrcTy, // rsrc(SGPR) @@ -1156,6 +1169,19 @@ class AMDGPURawPtrBufferLoad : DefaultAttrsIntri def int_amdgcn_raw_ptr_buffer_load_format : AMDGPURawPtrBufferLoad; def int_amdgcn_raw_ptr_buffer_load : AMDGPURawPtrBufferLoad; +class AMDGPURawPtrAtomicBufferLoad<LLVMType data_ty = llvm_any_ty> : Intrinsic < + [data_ty], + [AMDGPUBufferRsrcTy, // rsrc(SGPR) + llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) + llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) + [IntrArgMemOnly, NoCapture<ArgIndex<0>>, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, + AMDGPURsrcIntrinsic<0>; +def int_amdgcn_raw_ptr_atomic_buffer_load : AMDGPURawPtrAtomicBufferLoad; + class AMDGPUStructBufferLoad<LLVMType data_ty = llvm_any_ty> : DefaultAttrsIntrinsic < [data_ty], [llvm_v4i32_ty, // rsrc(SGPR)
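As an illustration (not part of this diff), a minimal LLVM IR sketch of the usage pattern the new intrinsics enable: a wave spins on a buffer location until another agent writes a non-zero value. Because the plain llvm.amdgcn.raw.buffer.load intrinsic carries IntrReadMem, LICM may treat such a load as loop-invariant and hoist it, so the loop would never observe the store; the atomic variant defined above is not IntrReadMem and therefore stays inside the loop. The kernel name and the choice of aux value (1 = glc) are made up for illustration.

define amdgpu_kernel void @spin_until_nonzero(<4 x i32> %rsrc) {
entry:
  br label %wait
wait:
  ; Re-executed every iteration: the atomic intrinsic is not IntrReadMem, so it
  ; cannot be treated as an invariant load and hoisted out of the loop.
  %flag = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %rsrc, i32 0, i32 0, i32 1)
  %done = icmp ne i32 %flag, 0
  br i1 %done, label %exit, label %wait
exit:
  ret void
}

declare i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)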
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 89ef0f299feabb..74e93b0620d26e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -7366,6 +7366,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return legalizeBufferStore(MI, MRI, B, true, true); case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_ptr_buffer_load: + case Intrinsic::amdgcn_raw_atomic_buffer_load: + case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_ptr_buffer_load: return legalizeBufferLoad(MI, MRI, B, false, false); diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index 6be9be21a8a861..77971323aa1ec6 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -1092,8 +1092,9 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr, Intrinsic::ID IID = Intrinsic::not_intrinsic; if (isa<LoadInst>(I)) - // TODO: Do we need to do something about atomic loads? - IID = Intrinsic::amdgcn_raw_ptr_buffer_load; + IID = Order == AtomicOrdering::NotAtomic + ? Intrinsic::amdgcn_raw_ptr_buffer_load + : Intrinsic::amdgcn_raw_ptr_atomic_buffer_load; else if (isa<StoreInst>(I)) IID = Intrinsic::amdgcn_raw_ptr_buffer_store; else if (auto *RMW = dyn_cast<AtomicRMWInst>(I)) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 68f47674587032..aa329a58547f3d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4985,6 +4985,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_ptr_buffer_load: + case Intrinsic::amdgcn_raw_atomic_buffer_load: + case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: case Intrinsic::amdgcn_raw_tbuffer_load: case Intrinsic::amdgcn_raw_ptr_tbuffer_load: { // FIXME: Should make intrinsic ID the last operand of the instruction, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a09e0ad2c0c296..7f95442401dbc2 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1277,6 +1277,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = CI.getArgOperand(1); return true; } + case Intrinsic::amdgcn_raw_atomic_buffer_load: + case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: { + Info.memVT = + memVTFromLoadIntrReturn(*this, MF.getDataLayout(), CI.getType(), + std::numeric_limits<unsigned>::max()); + Info.flags &= ~MachineMemOperand::MOStore; + return true; + } } } return true; @@ -8889,6 +8897,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_ptr_buffer_load: + case Intrinsic::amdgcn_raw_atomic_buffer_load: + case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: case Intrinsic::amdgcn_raw_ptr_buffer_load_format: { const bool IsFormat = diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll new file mode 100644 index 00000000000000..03f94d6e853f0b --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll @@ -0,0 +1,304 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK + +define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) { +; CHECK-LABEL: raw_atomic_buffer_load_i32: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB0_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1) + %cmp = icmp eq i32 %load, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) { +; CHECK-LABEL: raw_atomic_buffer_load_i32_off: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB1_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB1_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1) + %cmp = icmp eq i32 %load, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} +define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) { +; CHECK-LABEL: raw_atomic_buffer_load_i32_soff: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB2_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB2_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 4, i32 1) + %cmp = icmp eq i32 %load, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} +define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) { +; CHECK-LABEL: raw_atomic_buffer_load_i32_dlc: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 
+; CHECK-NEXT: .LBB3_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB3_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 4) + %cmp = icmp eq i32 %load, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) { +; CHECK-LABEL: raw_nonatomic_buffer_load_i32: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: .LBB4_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b32 s0, s1, s0 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; CHECK-NEXT: s_cbranch_execnz .LBB4_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 1) + %cmp = icmp eq i32 %load, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) { +; CHECK-LABEL: raw_atomic_buffer_load_i64: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB5_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB5_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.zext = zext i32 %id to i64 + br label %bb1 +bb1: + %load = call i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32> %addr, i32 4, i32 0, i32 1) + %cmp = icmp eq i64 %load, %id.zext + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) { +; CHECK-LABEL: raw_atomic_buffer_load_v2i16: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB6_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB6_1 
+; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32> %addr, i32 0, i32 0, i32 1) + %bitcast = bitcast <2 x i16> %load to i32 + %cmp = icmp eq i32 %bitcast, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) { +; CHECK-LABEL: raw_atomic_buffer_load_v4i16: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB7_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB7_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32> %addr, i32 4, i32 0, i32 1) + %shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 1> + %bitcast = bitcast <2 x i16> %shortened to i32 + %cmp = icmp eq i32 %bitcast, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) { +; CHECK-LABEL: raw_atomic_buffer_load_v4i32: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB8_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB8_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32> %addr, i32 4, i32 0, i32 1) + %extracted = extractelement <4 x i32> %load, i32 3 + %cmp = icmp eq i32 %extracted, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) { +; CHECK-LABEL: raw_atomic_buffer_load_ptr: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB9_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_load_b32 v1, v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB9_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = 
tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32> %addr, i32 4, i32 0, i32 1) + %elem = load i32, ptr %load + %cmp = icmp eq i32 %elem, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +; Function Attrs: nounwind readonly +declare i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) +declare i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32>, i32, i32, i32 immarg) +declare <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32>, i32, i32, i32 immarg) +declare <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32>, i32, i32, i32 immarg) +declare <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg) +declare ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32>, i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll new file mode 100644 index 00000000000000..3228335073d077 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll @@ -0,0 +1,304 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK +; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK + +define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %ptr) { +; CHECK-LABEL: raw_ptr_atomic_buffer_ptr_load_i32: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB0_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB0_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 0, i32 0, i32 1) + %cmp = icmp eq i32 %load, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) %ptr) { +; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_off: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB1_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB1_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 0, i32 0, i32 1) + %cmp = icmp eq i32 %load, %id + br i1 %cmp, 
label %bb1, label %bb2 +bb2: + ret void +} +define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8) %ptr) { +; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_soff: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB2_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB2_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 4, i32 4, i32 1) + %cmp = icmp eq i32 %load, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} +define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) %ptr) { +; CHECK-LABEL: raw_ptr_atomic_buffer_load_i32_dlc: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB3_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB3_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8) %ptr, i32 4, i32 0, i32 4) + %cmp = icmp eq i32 %load, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %ptr) { +; CHECK-LABEL: raw_nonptr_atomic_buffer_load_i32: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc +; CHECK-NEXT: s_mov_b32 s0, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: .LBB4_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; CHECK-NEXT: s_or_b32 s0, s1, s0 +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 +; CHECK-NEXT: s_cbranch_execnz .LBB4_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1) + %cmp = icmp eq i32 %load, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr) { +; CHECK-LABEL: raw_ptr_atomic_buffer_load_i64: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB5_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) 
+; CHECK-NEXT: buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB5_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %id.zext = zext i32 %id to i64 + br label %bb1 +bb1: + %load = call i64 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i64(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1) + %cmp = icmp eq i64 %load, %id.zext + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %ptr) { +; CHECK-LABEL: raw_ptr_atomic_buffer_load_v2i16: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB6_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB6_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call <2 x i16> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v2i16(ptr addrspace(8) %ptr, i32 0, i32 0, i32 1) + %bitcast = bitcast <2 x i16> %load to i32 + %cmp = icmp eq i32 %bitcast, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %ptr) { +; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i16: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB7_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB7_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call <4 x i16> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v4i16(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1) + %shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 1> + %bitcast = bitcast <2 x i16> %shortened to i32 + %cmp = icmp eq i32 %bitcast, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %ptr) { +; CHECK-LABEL: raw_ptr_atomic_buffer_load_v4i32: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB8_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; 
CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB8_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call <4 x i32> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v4i32(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1) + %extracted = extractelement <4 x i32> %load, i32 3 + %cmp = icmp eq i32 %extracted, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr) { +; CHECK-LABEL: raw_ptr_atomic_buffer_load_ptr: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: .LBB9_1: ; %bb1 +; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: flat_load_b32 v1, v[1:2] +; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 +; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 +; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_cbranch_execnz .LBB9_1 +; CHECK-NEXT: ; %bb.2: ; %bb2 +; CHECK-NEXT: s_endpgm +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + br label %bb1 +bb1: + %load = call ptr @llvm.amdgcn.raw.ptr.atomic.buffer.load.ptr(ptr addrspace(8) %ptr, i32 4, i32 0, i32 1) + %elem = load i32, ptr %load + %cmp = icmp eq i32 %elem, %id + br i1 %cmp, label %bb1, label %bb2 +bb2: + ret void +} + +; Function Attrs: nounwind readonly +declare i32 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg) +declare i64 @llvm.amdgcn.raw.ptr.atomic.buffer.load.i64(ptr addrspace(8), i32, i32, i32 immarg) +declare <2 x i16> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v2i16(ptr addrspace(8), i32, i32, i32 immarg) +declare <4 x i16> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v4i16(ptr addrspace(8), i32, i32, i32 immarg) +declare <4 x i32> @llvm.amdgcn.raw.ptr.atomic.buffer.load.v4i32(ptr addrspace(8), i32, i32, i32 immarg) +declare ptr @llvm.amdgcn.raw.ptr.atomic.buffer.load.ptr(ptr addrspace(8), i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8), i32, i32, i32 immarg) +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll index 66c68f7cc731ea..57028a0f9b14f3 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-memops.ll @@ -17,10 +17,10 @@ define void @loads(ptr addrspace(8) %buf) { ; CHECK-NEXT: [[VOLATILE:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483648) ; CHECK-NEXT: [[VOLATILE_NONTEMPORAL:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483646), !nontemporal [[META0]] ; CHECK-NEXT: fence syncscope("wavefront") release -; CHECK-NEXT: [[ATOMIC:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 -2147483647) +; CHECK-NEXT: [[ATOMIC:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, 
i32 -2147483647) ; CHECK-NEXT: fence syncscope("wavefront") acquire -; CHECK-NEXT: [[ATOMIC_MONOTONIC:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1) -; CHECK-NEXT: [[ATOMIC_ACQUIRE:%.*]] = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1) +; CHECK-NEXT: [[ATOMIC_MONOTONIC:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1) +; CHECK-NEXT: [[ATOMIC_ACQUIRE:%.*]] = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 [[BUF]], i32 16, i32 0, i32 1) ; CHECK-NEXT: fence acquire ; CHECK-NEXT: ret void ;
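As an illustration of the AMDGPULowerBufferFatPointers change (not part of this diff), a minimal sketch of the rewrite it now performs for atomic loads through buffer fat pointers. The function name, addrspacecast-based construction of the fat pointer, and offsets are assumptions made for this example; the lowered form is paraphrased from the CHECK lines above rather than being actual pass output.

; An acquire atomic load through a buffer fat pointer (addrspace(7)),
; built here from an addrspace(8) resource plus a 16-byte offset.
define float @fat_ptr_acquire_load(ptr addrspace(8) %buf) {
  %fat = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7)
  %p = getelementptr float, ptr addrspace(7) %fat, i32 4
  %v = load atomic float, ptr addrspace(7) %p acquire, align 4
  ret float %v
}
; After the buffer fat pointer lowering, the load is rewritten to the new atomic
; intrinsic (instead of llvm.amdgcn.raw.ptr.buffer.load), and the acquire ordering
; is expressed as a trailing fence, roughly:
;   %v = call float @llvm.amdgcn.raw.ptr.atomic.buffer.load.f32(ptr addrspace(8) align 4 %buf, i32 16, i32 0, i32 1)
;   fence acquire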