diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 313e84f3722a95a..25a230f65fd3ddc 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -465,6 +465,94 @@ least-significant bit position. 0xffffffff is returned if no 1 bit is found.
 TMA family of Intrinsics
 ------------------------
 
+'``llvm.nvvm.cp.async.bulk.global.to.shared.cluster``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.global.to.shared.cluster``' intrinsic
+corresponds to the ``cp.async.bulk.shared::cluster.global.*`` family
+of PTX instructions. These instructions initiate an asynchronous
+copy of bulk data from global memory to shared::cluster memory.
+The 32-bit operand ``%size`` specifies the amount of memory to be
+copied, and it must be a multiple of 16.
+
+* The last two arguments to these intrinsics are boolean flags
+  indicating support for cache_hint and/or multicast modifiers.
+  These flag arguments must be compile-time constants. The backend
+  looks through these flags and lowers the intrinsics appropriately.
+
+* The Nth argument (denoted by ``i1 %flag_ch``), when set, indicates
+  a valid cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
+
+* The [N-1]th argument (denoted by ``i1 %flag_mc``), when set, indicates
+  the presence of a multicast mask (``i16 %mc``) and generates the PTX
+  instruction with the ``.multicast::cluster`` modifier.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk>`_.
+
+'``llvm.nvvm.cp.async.bulk.shared.cta.to.global``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 %flag_ch)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.shared.cta.to.global``' intrinsic
+corresponds to the ``cp.async.bulk.global.shared::cta.*`` set of PTX
+instructions. These instructions initiate an asynchronous copy from
+shared::cta to global memory. The 32-bit operand ``%size`` specifies
+the amount of memory to be copied, and it must be a multiple of 16.
+
+* The last argument to these intrinsics is a boolean flag
+  indicating support for cache_hint. This flag argument must
+  be a compile-time constant. When set, it indicates a valid
+  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk>`_.
+
+'``llvm.nvvm.cp.async.bulk.shared.cta.to.cluster``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.shared.cta.to.cluster``' intrinsic
+corresponds to the ``cp.async.bulk.shared::cluster.shared::cta.*``
+PTX instruction. This instruction initiates an asynchronous copy from
+shared::cta to shared::cluster memory. The destination must be in
+the shared memory of a different CTA within the cluster. The 32-bit
+operand ``%size`` specifies the amount of memory to be copied, and
+it must be a multiple of 16.
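+
+For example, a minimal illustrative use (the pointer names are
+placeholders; ``%mbar`` must reference an mbarrier object, and the
+destination must map to a peer CTA's shared memory):
+
+.. code-block:: llvm
+
+  ; copy 128 bytes (a multiple of 16) to a peer CTA within the cluster
+  call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 128)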
+
+For more information, refer to the PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk>`_.
+
 '``llvm.nvvm.cp.async.bulk.tensor.g2s.tile.[1-5]d``'
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index fd07d131ce15b24..ae04a130bc82546 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -4980,4 +4980,47 @@ foreach dim = [1, 2, 3, 4, 5] in {
   }
 }
 
+// Intrinsics for Bulk Copy using TMA (non-tensor)
+// From Global to Shared Cluster
+def int_nvvm_cp_async_bulk_global_to_shared_cluster
+  : DefaultAttrsIntrinsic<[],
+      [llvm_shared_ptr_ty, // dst_smem_ptr
+       llvm_shared_ptr_ty, // mbarrier_ptr
+       llvm_global_ptr_ty, // src_gmem_ptr
+       llvm_i32_ty,        // copy_size
+       llvm_i16_ty,        // cta_mask
+       llvm_i64_ty,        // cache_hint
+       llvm_i1_ty,         // Flag for cta_mask
+       llvm_i1_ty],        // Flag for cache_hint
+      [IntrConvergent, IntrArgMemOnly,
+       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
+       NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+       NoCapture<ArgIndex<2>>, ImmArg<ArgIndex<6>>,
+       ImmArg<ArgIndex<7>>]>;
+
+// From Shared CTA to Shared Cluster
+def int_nvvm_cp_async_bulk_shared_cta_to_cluster
+  : DefaultAttrsIntrinsic<[],
+      [llvm_shared_ptr_ty, // dst_smem_ptr
+       llvm_shared_ptr_ty, // mbarrier_ptr
+       llvm_shared_ptr_ty, // src_smem_ptr
+       llvm_i32_ty],       // copy_size
+      [IntrConvergent, IntrArgMemOnly,
+       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
+       NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+       NoCapture<ArgIndex<2>>]>;
+
+// From Shared CTA to Global memory
+def int_nvvm_cp_async_bulk_shared_cta_to_global
+  : DefaultAttrsIntrinsic<[],
+      [llvm_global_ptr_ty, // dst_gmem_ptr
+       llvm_shared_ptr_ty, // src_smem_ptr
+       llvm_i32_ty,        // copy_size
+       llvm_i64_ty,        // cache_hint
+       llvm_i1_ty],        // Flag for cache_hint
+      [IntrConvergent, IntrArgMemOnly,
+       WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<1>>,
+       NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
+       ImmArg<ArgIndex<4>>]>;
+
 } // let TargetPrefix = "nvvm"
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ef97844142d403b..1341f8a8fca1fbc 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -3024,6 +3024,75 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
   ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
 }
 
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkS2G(SDNode *N) {
+  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
+  // dst, src, size, cache_hint, cache_hint_flag
+  // NumOperands = {Chain, IID} + {Actual intrinsic args}
+  //             = {2}          + {5}
+  size_t NumOps = N->getNumOperands();
+  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
+  size_t NumArgs = IsCacheHint ? 4 : 3; // dst, src, size, cache_hint
+
+  SDLoc DL(N);
+  SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
+  Ops.push_back(N->getOperand(0)); // Chain operand
+
+  bool IsShared32 =
+      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
+  unsigned Opcode;
+  if (IsCacheHint)
+    Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32_CH
+                        : NVPTX::CP_ASYNC_BULK_S2G_CH;
+  else
+    Opcode = IsShared32 ? NVPTX::CP_ASYNC_BULK_S2G_SHARED32
+                        : NVPTX::CP_ASYNC_BULK_S2G;
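+  // For example (illustrative): with IsCacheHint set, a node carrying
+  // (dst, src, size, ch) selects the PTX instruction
+  //   cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint
+  //       [dst], [src], size, ch;
+  // and otherwise the plain ".bulk_group" form
+  // (cf. CpAsyncBulkStr.S2G in NVPTXIntrinsics.td).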
+  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
+}
+
+void NVPTXDAGToDAGISel::SelectCpAsyncBulkG2S(SDNode *N) {
+  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
+  // {dst, mbar, src, size, multicast, cache_hint,
+  //  multicast_flag, cache_hint_flag}
+  // NumOperands = {Chain, IID} + {Actual intrinsic args}
+  //             = {2}          + {8}
+  size_t NumOps = N->getNumOperands();
+  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
+  bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1;
+  size_t NumBaseArgs = 4;                // dst, mbar, src, size
+  size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID
+
+  SDLoc DL(N);
+  SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
+
+  // Push MultiCast operand, if available
+  if (IsMultiCast)
+    Ops.push_back(N->getOperand(MultiCastIdx));
+
+  // Push CacheHint operand, if available
+  if (IsCacheHint)
+    Ops.push_back(N->getOperand(MultiCastIdx + 1));
+
+  // Finally, the chain operand
+  Ops.push_back(N->getOperand(0));
+
+  bool IsShared32 =
+      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
+  unsigned Opcode = [&]() {
+    if (IsMultiCast && IsCacheHint)
+      return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC_CH
+                        : NVPTX::CP_ASYNC_BULK_G2S_MC_CH;
+    if (IsMultiCast)
+      return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_MC
+                        : NVPTX::CP_ASYNC_BULK_G2S_MC;
+    if (IsCacheHint)
+      return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32_CH
+                        : NVPTX::CP_ASYNC_BULK_G2S_CH;
+    return IsShared32 ? NVPTX::CP_ASYNC_BULK_G2S_SHARED32
+                      : NVPTX::CP_ASYNC_BULK_G2S;
+  }();
+  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
+}
+
 bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   unsigned IID = N->getConstantOperandVal(1);
   using TMARedTy = llvm::nvvm::TMAReductionOp;
@@ -3031,6 +3100,12 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   switch (IID) {
   default:
     return false;
+  case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
+    SelectCpAsyncBulkG2S(N);
+    return true;
+  case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_global:
+    SelectCpAsyncBulkS2G(N);
+    return true;
   case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index c307f28fcc6c0a2..4b67a370c3fe09c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -90,6 +90,8 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
   void SelectV2I64toI128(SDNode *N);
   void SelectI128toV2I64(SDNode *N);
+  void SelectCpAsyncBulkG2S(SDNode *N);
+  void SelectCpAsyncBulkS2G(SDNode *N);
   void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
   void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
   void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 8ede1ec4f20dc97..22339ebc5484f1f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -498,9 +498,71 @@ def CP_ASYNC_BULK_WAIT_GROUP_READ :
                    [(int_nvvm_cp_async_bulk_wait_group_read timm:$n)]>,
                    Requires<[hasPTX<80>, hasSM<90>]>;
 
-//-----------------------------------
-// TMA Async Tensor Copy Functions
-//-----------------------------------
+//------------------------------
+// TMA Async Bulk Copy Functions
+//------------------------------
+
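+// CpAsyncBulkStr<mc, ch> assembles the PTX string for a bulk-copy
+// instruction. For example (illustrative), CpAsyncBulkStr<1, 1>.G2S
+// expands to
+// "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint".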
+class CpAsyncBulkStr<bit mc, bit ch> {
+  // Shared to Global memory
+  string S2G = "cp.async.bulk.global.shared::cta.bulk_group"
+               # !if(ch, ".L2::cache_hint", "");
+
+  // Global to Shared cluster memory
+  string G2S = "cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes"
+               # !if(mc, ".multicast::cluster", "")
+               # !if(ch, ".L2::cache_hint", "");
+
+  // Shared CTA to Cluster memory
+  string C2C = "cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes";
+}
+
+multiclass CP_ASYNC_BULK_S2G<NVPTXRegClass rc> {
+  def NAME: NVPTXInst<(outs),
+            (ins Int64Regs:$dst, rc:$src, Int32Regs:$size),
+            !strconcat(CpAsyncBulkStr<0, 0>.S2G, " [$dst], [$src], $size;"), []>,
+            Requires<[hasPTX<80>, hasSM<90>]>;
+  def NAME # _CH: NVPTXInst<(outs),
+                  (ins Int64Regs:$dst, rc:$src, Int32Regs:$size, Int64Regs:$ch),
+                  !strconcat(CpAsyncBulkStr<0, 1>.S2G, " [$dst], [$src], $size, $ch;"), []>,
+                  Requires<[hasPTX<80>, hasSM<90>]>;
+}
+defm CP_ASYNC_BULK_S2G : CP_ASYNC_BULK_S2G<Int64Regs>;
+defm CP_ASYNC_BULK_S2G_SHARED32 : CP_ASYNC_BULK_S2G<Int32Regs>;
+
+multiclass CP_ASYNC_BULK_G2S<NVPTXRegClass rc> {
+  def NAME: NVPTXInst<(outs),
+            (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size),
+            !strconcat(CpAsyncBulkStr<0, 0>.G2S, " [$dst], [$src], $size, [$mbar];"), []>,
+            Requires<[hasPTX<80>, hasSM<90>]>;
+  def NAME # _MC: NVPTXInst<(outs),
+                  (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int16Regs:$mc),
+                  !strconcat(CpAsyncBulkStr<1, 0>.G2S, " [$dst], [$src], $size, [$mbar], $mc;"), []>,
+                  Requires<[hasPTX<80>, hasSM<90>]>;
+  def NAME # _CH: NVPTXInst<(outs),
+                  (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int64Regs:$ch),
+                  !strconcat(CpAsyncBulkStr<0, 1>.G2S, " [$dst], [$src], $size, [$mbar], $ch;"), []>,
+                  Requires<[hasPTX<80>, hasSM<90>]>;
+  def NAME # _MC_CH: NVPTXInst<(outs),
+                     (ins rc:$dst, rc:$mbar, Int64Regs:$src, Int32Regs:$size, Int16Regs:$mc, Int64Regs:$ch),
+                     !strconcat(CpAsyncBulkStr<1, 1>.G2S, " [$dst], [$src], $size, [$mbar], $mc, $ch;"), []>,
+                     Requires<[hasPTX<80>, hasSM<90>]>;
+}
+defm CP_ASYNC_BULK_G2S : CP_ASYNC_BULK_G2S<Int64Regs>;
+defm CP_ASYNC_BULK_G2S_SHARED32 : CP_ASYNC_BULK_G2S<Int32Regs>;
+
+multiclass CP_ASYNC_BULK_CTA_TO_CLUSTER<NVPTXRegClass rc> {
+  def NAME: NVPTXInst<(outs),
+            (ins rc:$dst, rc:$mbar, rc:$src, Int32Regs:$size),
+            !strconcat(CpAsyncBulkStr<0, 0>.C2C, " [$dst], [$src], $size, [$mbar];"),
+            [(int_nvvm_cp_async_bulk_shared_cta_to_cluster rc:$dst, rc:$mbar, rc:$src, Int32Regs:$size)]>,
+            Requires<[hasPTX<80>, hasSM<90>]>;
+}
+defm CP_ASYNC_BULK_CTA_TO_CLUSTER : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int64Regs>;
+defm CP_ASYNC_BULK_CTA_TO_CLUSTER_SHARED32 : CP_ASYNC_BULK_CTA_TO_CLUSTER<Int32Regs>;
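+
+// Note (illustrative): the _SHARED32 instances cover builds where
+// shared-space pointers are 32 bits wide (--nvptx-short-ptr); their
+// shared-address operands then live in Int32Regs instead of Int64Regs.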
+
+//-------------------------------------
+// TMA Async Bulk Tensor Copy Functions
+//-------------------------------------
 
 // From Global to Shared memory (G2S)
 class G2S_STRINGS<int dim, string mode, bit mc, bit ch, bit is_shared32 = 0> {
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
new file mode 100644
index 000000000000000..aefd18a0632a089
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk.ll
@@ -0,0 +1,118 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr | FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
+; RUN: %if ptxas-12.3 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 --nvptx-short-ptr | %ptxas-verify -arch=sm_90 %}
+
+target triple = "nvptx64-nvidia-cuda"
+
+declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i16, i64, i1, i1)
+declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1), ptr addrspace(3), i32, i64, i1)
+declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(3), i32)
+
+define void @cp_async_bulk_g2s(ptr addrspace(1) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 %size, i16 %mc, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_g2s(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX64-NEXT:    .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<5>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_g2s_param_0];
+; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_g2s_param_1];
+; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_g2s_param_2];
+; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_g2s_param_3];
+; CHECK-PTX64-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2];
+; CHECK-PTX64-NEXT:    ld.param.u64 %rd4, [cp_async_bulk_g2s_param_5];
+; CHECK-PTX64-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rd4;
+; CHECK-PTX64-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_g2s_param_4];
+; CHECK-PTX64-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%rd3], [%rd1], %r1, [%rd2], %rs1;
+; CHECK-PTX64-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%rd3], [%rd1], %r1, [%rd2], %rs1, %rd4;
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_g2s(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b16 %rs<2>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<4>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_g2s_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_g2s_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_g2s_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_g2s_param_3];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes [%r2], [%rd1], %r3, [%r1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_g2s_param_5];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rd2;
+; CHECK-PTX-SHARED32-NEXT:    ld.param.u16 %rs1, [cp_async_bulk_g2s_param_4];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster [%r2], [%rd1], %r3, [%r1], %rs1;
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.shared::cluster.global.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint [%r2], [%rd1], %r3, [%r1], %rs1, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 0, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 %ch, i1 0, i1 1)
+  tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 0, i1 1, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 1, i1 1)
+  ret void
+}
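+
+; Note (illustrative): the four calls above exercise every flag combination;
+; the two trailing i1 constants select the plain, .L2::cache_hint,
+; .multicast::cluster, and .multicast::cluster.L2::cache_hint variants.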
+
+define void @cp_async_bulk_s2g(ptr addrspace(3) %src, ptr addrspace(1) %dst, i32 %size, i64 %ch) {
+; CHECK-PTX64-LABEL: cp_async_bulk_s2g(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<4>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_s2g_param_0];
+; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_s2g_param_1];
+; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_s2g_param_2];
+; CHECK-PTX64-NEXT:    cp.async.bulk.global.shared::cta.bulk_group [%rd2], [%rd1], %r1;
+; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_s2g_param_3];
+; CHECK-PTX64-NEXT:    cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd2], [%rd1], %r1, %rd3;
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_s2g(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<3>;
+; CHECK-PTX-SHARED32-NEXT:    .reg .b64 %rd<3>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_s2g_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_s2g_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_s2g_param_2];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.global.shared::cta.bulk_group [%rd1], [%r1], %r2;
+; CHECK-PTX-SHARED32-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_s2g_param_3];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.global.shared::cta.bulk_group.L2::cache_hint [%rd1], [%r1], %r2, %rd2;
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 0, i1 0)
+  tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.global(ptr addrspace(1) %dst, ptr addrspace(3) %src, i32 %size, i64 %ch, i1 1)
+  ret void
+}
+
+define void @cp_async_bulk_cta_to_cluster(ptr addrspace(3) %src, ptr addrspace(3) %bar, ptr addrspace(3) %dst, i32 %size) {
+; CHECK-PTX64-LABEL: cp_async_bulk_cta_to_cluster(
+; CHECK-PTX64:       {
+; CHECK-PTX64-NEXT:    .reg .b32 %r<2>;
+; CHECK-PTX64-NEXT:    .reg .b64 %rd<4>;
+; CHECK-PTX64-EMPTY:
+; CHECK-PTX64-NEXT:  // %bb.0:
+; CHECK-PTX64-NEXT:    ld.param.u64 %rd1, [cp_async_bulk_cta_to_cluster_param_0];
+; CHECK-PTX64-NEXT:    ld.param.u64 %rd2, [cp_async_bulk_cta_to_cluster_param_1];
+; CHECK-PTX64-NEXT:    ld.param.u64 %rd3, [cp_async_bulk_cta_to_cluster_param_2];
+; CHECK-PTX64-NEXT:    ld.param.u32 %r1, [cp_async_bulk_cta_to_cluster_param_3];
+; CHECK-PTX64-NEXT:    cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%rd3], [%rd1], %r1, [%rd2];
+; CHECK-PTX64-NEXT:    ret;
+;
+; CHECK-PTX-SHARED32-LABEL: cp_async_bulk_cta_to_cluster(
+; CHECK-PTX-SHARED32:       {
+; CHECK-PTX-SHARED32-NEXT:    .reg .b32 %r<5>;
+; CHECK-PTX-SHARED32-EMPTY:
+; CHECK-PTX-SHARED32-NEXT:  // %bb.0:
+; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r1, [cp_async_bulk_cta_to_cluster_param_0];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r2, [cp_async_bulk_cta_to_cluster_param_1];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r3, [cp_async_bulk_cta_to_cluster_param_2];
+; CHECK-PTX-SHARED32-NEXT:    ld.param.u32 %r4, [cp_async_bulk_cta_to_cluster_param_3];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.shared::cluster.shared::cta.mbarrier::complete_tx::bytes [%r3], [%r1], %r4, [%r2];
+; CHECK-PTX-SHARED32-NEXT:    ret;
+  tail call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src, i32 %size)
+  ret void
+}