[NVPTX] Add support for Shared Cluster Memory address space [2/2] #136768

Merged · 1 commit · Apr 22, 2025

2 changes: 1 addition & 1 deletion clang/test/CodeGenCUDA/builtins-sm90.cu
@@ -50,7 +50,7 @@ __attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
auto * sptr = (__attribute__((address_space(3))) void *)ptr;
// CHECK: call ptr @llvm.nvvm.mapa(ptr %{{.*}}, i32 %{{.*}})
out[i++] = (long) __nvvm_mapa(ptr, u);
// CHECK: call ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
// CHECK: call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
out[i++] = (long) __nvvm_mapa_shared_cluster(sptr, u);
// CHECK: call i32 @llvm.nvvm.getctarank(ptr {{.*}})
out[i++] = __nvvm_getctarank(ptr);
33 changes: 30 additions & 3 deletions llvm/docs/NVPTXUsage.rst
@@ -108,6 +108,7 @@ The NVPTX back-end uses the following address space mapping:
3 Shared
4 Constant
5 Local
7 Shared Cluster
============= ======================

Every global variable and pointer type is assigned to one of these address
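
For illustration (a small sketch, not taken from the patch), a pointer in the new space is written ``addrspace(7)`` in IR, and a CTA-local shared pointer can be widened into it:

.. code-block:: llvm

  ; Sketch: cast a CTA-local shared pointer (address space 3) into the
  ; shared cluster space (7); the auto-upgrade code in this patch performs
  ; the same cast when rewriting old intrinsic calls.
  define ptr addrspace(7) @widen_to_cluster(ptr addrspace(3) %smem) {
    %dst = addrspacecast ptr addrspace(3) %smem to ptr addrspace(7)
    ret ptr addrspace(7) %dst
  }
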
@@ -306,6 +307,32 @@ If the given pointer in the generic address space refers to memory which falls
within the state space of the intrinsic (and therefore could be safely address
space casted to this space), 1 is returned, otherwise 0 is returned.

'``llvm.nvvm.mapa.*``' Intrinsics
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

declare ptr @llvm.nvvm.mapa(ptr %p, i32 %rank)
declare ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p, i32 %rank)

Overview:
"""""""""

The '``llvm.nvvm.mapa.*``' intrinsics map a shared memory pointer ``p`` to the corresponding address in the shared memory of the CTA whose rank within the cluster is ``%rank``.
The ``llvm.nvvm.mapa`` form takes a generic pointer to shared memory and returns a generic pointer to shared cluster memory.
The ``llvm.nvvm.mapa.shared.cluster`` form takes a pointer in the shared address space and returns a pointer in the shared cluster address space.
They correspond directly to the ``mapa`` and ``mapa.shared.cluster`` PTX instructions.

Semantics:
""""""""""

Given a pointer ``p`` to shared memory and a CTA rank ``%rank``, these intrinsics
return the address of the corresponding shared memory location in the CTA with
that rank in the cluster. The result of ``llvm.nvvm.mapa`` is a generic pointer,
while ``llvm.nvvm.mapa.shared.cluster`` returns a shared cluster pointer
(address space 7), since the mapped address may belong to any CTA in the cluster.
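
A minimal usage sketch (illustrative only; it assumes ``%rank`` is a valid CTA rank within the launched cluster):

.. code-block:: llvm

  ; Sketch: remap a CTA-local shared pointer to the corresponding location
  ; in the CTA whose cluster rank is %rank, then load through the result.
  define i32 @map_and_load(ptr addrspace(3) %p, i32 %rank) {
    %peer = call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p, i32 %rank)
    %v = load i32, ptr addrspace(7) %peer
    ret i32 %v
  }

  declare ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3), i32)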

Arithmetic Intrinsics
---------------------

@@ -552,7 +579,7 @@ Syntax:

.. code-block:: llvm

declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)

Overview:
"""""""""
@@ -616,7 +643,7 @@ Syntax:

.. code-block:: llvm

declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)

Overview:
"""""""""
@@ -771,7 +798,7 @@ Syntax:

.. code-block:: llvm

declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
53 changes: 27 additions & 26 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -127,10 +127,11 @@
// * llvm.nvvm.atomic.load.inc.32 --> atomicrmw uinc_wrap
// * llvm.nvvm.atomic.load.dec.32 --> atomicrmw udec_wrap

def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr

//
// MISC
@@ -691,15 +692,15 @@ class CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, string mode> {
list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
list<LLVMType> ArgsTy = !listconcat(
[llvm_shared_ptr_ty, // dst_smem_ptr
llvm_shared_ptr_ty, // mbarrier_smem_ptr
llvm_ptr_ty], // tensormap_ptr
TensorDimsTy, // actual tensor dims
Im2ColOffsetsTy, // im2col offsets
[llvm_i16_ty, // cta_mask
llvm_i64_ty, // cache_hint
llvm_i1_ty, // Flag for cta_mask
llvm_i1_ty] // Flag for cache_hint
[llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr
llvm_shared_ptr_ty, // mbarrier_smem_ptr
llvm_ptr_ty], // tensormap_ptr
TensorDimsTy, // actual tensor dims
Im2ColOffsetsTy, // im2col offsets
[llvm_i16_ty, // cta_mask
llvm_i64_ty, // cache_hint
llvm_i1_ty, // Flag for cta_mask
llvm_i1_ty] // Flag for cache_hint
);

int TempFlagsStartIdx = !add(dim, 5);
@@ -5134,7 +5135,7 @@ def int_nvvm_mapa
[IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
"llvm.nvvm.mapa">;
def int_nvvm_mapa_shared_cluster
: DefaultAttrsIntrinsic<[llvm_shared_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
: DefaultAttrsIntrinsic<[llvm_shared_cluster_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
"llvm.nvvm.mapa.shared.cluster">;
def int_nvvm_getctarank
@@ -5234,14 +5235,14 @@ def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[],
// From Global to Shared Cluster
def int_nvvm_cp_async_bulk_global_to_shared_cluster
: DefaultAttrsIntrinsic<[],
[llvm_shared_ptr_ty, // dst_smem_ptr
llvm_shared_ptr_ty, // mbarrier_ptr
llvm_global_ptr_ty, // src_gmem_ptr
llvm_i32_ty, // copy_size
llvm_i16_ty, // cta_mask
llvm_i64_ty, // cache_hint
llvm_i1_ty, // Flag for cta_mask
llvm_i1_ty], // Flag for cache_hint
[llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr
llvm_shared_ptr_ty, // mbarrier_ptr
llvm_global_ptr_ty, // src_gmem_ptr
llvm_i32_ty, // copy_size
llvm_i16_ty, // cta_mask
llvm_i64_ty, // cache_hint
llvm_i1_ty, // Flag for cta_mask
llvm_i1_ty], // Flag for cache_hint
[IntrConvergent, IntrArgMemOnly,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
@@ -5251,10 +5252,10 @@ def int_nvvm_cp_async_bulk_global_to_shared_cluster
// From Shared CTA to Shared Cluster
def int_nvvm_cp_async_bulk_shared_cta_to_cluster
: DefaultAttrsIntrinsic<[],
[llvm_shared_ptr_ty, // dst_smem_ptr
llvm_shared_ptr_ty, // mbarrier_ptr
llvm_shared_ptr_ty, // src_smem_ptr
llvm_i32_ty], // copy_size
[llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr
llvm_shared_ptr_ty, // mbarrier_ptr
llvm_shared_ptr_ty, // src_smem_ptr
llvm_i32_ty], // copy_size
[IntrConvergent, IntrArgMemOnly,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
83 changes: 83 additions & 0 deletions llvm/lib/IR/AutoUpgrade.cpp
@@ -46,6 +46,7 @@
#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/NVPTXAddrSpace.h"
#include "llvm/Support/Regex.h"
#include "llvm/TargetParser/Triple.h"
#include <cstdint>
@@ -938,6 +939,47 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
return false; // No other 'arm.*', 'aarch64.*'.
}

static Intrinsic::ID shouldUpgradeNVPTXSharedClusterIntrinsic(Function *F,
StringRef Name) {
if (Name.consume_front("mapa.shared.cluster"))
if (F->getReturnType()->getPointerAddressSpace() ==
NVPTXAS::ADDRESS_SPACE_SHARED)
return Intrinsic::nvvm_mapa_shared_cluster;

if (Name.consume_front("cp.async.bulk.")) {
Intrinsic::ID ID =
StringSwitch<Intrinsic::ID>(Name)
.Case("global.to.shared.cluster",
Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster)
.Case("shared.cta.to.cluster",
Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster)
.Case("tensor.g2s.im2col.3d",
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d)
.Case("tensor.g2s.im2col.4d",
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d)
.Case("tensor.g2s.im2col.5d",
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d)
.Case("tensor.g2s.tile.1d",
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d)
.Case("tensor.g2s.tile.2d",
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d)
.Case("tensor.g2s.tile.3d",
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d)
.Case("tensor.g2s.tile.4d",
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d)
.Case("tensor.g2s.tile.5d",
Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d)
.Default(Intrinsic::not_intrinsic);

if (ID != Intrinsic::not_intrinsic)
if (F->getArg(0)->getType()->getPointerAddressSpace() ==
NVPTXAS::ADDRESS_SPACE_SHARED)
return ID;
}

return Intrinsic::not_intrinsic;
}

static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) {
if (Name.consume_front("fma.rn."))
return StringSwitch<Intrinsic::ID>(Name)
@@ -1278,6 +1320,14 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
}
}

// Upgrade Distributed Shared Memory Intrinsics
Intrinsic::ID IID = shouldUpgradeNVPTXSharedClusterIntrinsic(F, Name);
if (IID != Intrinsic::not_intrinsic) {
rename(F);
NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID);
return true;
}

// The following nvvm intrinsics correspond exactly to an LLVM idiom, but
// not to an intrinsic alone. We expand them in UpgradeIntrinsicCall.
//
@@ -4718,6 +4768,39 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
CI->eraseFromParent();
return;
}
case Intrinsic::nvvm_mapa_shared_cluster: {
// Create a new call with the correct address space.
NewCall =
Builder.CreateCall(NewFn, {CI->getArgOperand(0), CI->getArgOperand(1)});
Value *Res = NewCall;
Res = Builder.CreateAddrSpaceCast(
Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED));
NewCall->takeName(CI);
CI->replaceAllUsesWith(Res);
CI->eraseFromParent();
return;
}
case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: {
// Create a new call with the correct address space.
SmallVector<Value *, 4> Args(CI->args());
Args[0] = Builder.CreateAddrSpaceCast(
Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER));

NewCall = Builder.CreateCall(NewFn, Args);
NewCall->takeName(CI);
CI->replaceAllUsesWith(NewCall);
CI->eraseFromParent();
return;
}
case Intrinsic::riscv_sha256sig0:
case Intrinsic::riscv_sha256sig1:
case Intrinsic::riscv_sha256sum0:
57 changes: 57 additions & 0 deletions llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll
@@ -59,6 +59,21 @@ declare float @llvm.nvvm.ldg.global.f.f32.p0(ptr, i32)
declare i32 @llvm.nvvm.atomic.load.inc.32(ptr, i32)
declare i32 @llvm.nvvm.atomic.load.dec.32(ptr, i32)

declare ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3), i32)

declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(1), i32, i16, i64, i1, i1)
declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3), ptr addrspace(3), ptr addrspace(3), i32)

declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2);

declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 %f1, i1 %f2);
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 %f1, i1 %f2);

; CHECK-LABEL: @simple_upgrade
define void @simple_upgrade(i32 %a, i64 %b, i16 %c) {
; CHECK: call i32 @llvm.bitreverse.i32(i32 %a)
@@ -254,3 +269,45 @@ define i32 @atomics(ptr %p0, i32 %a) {
ret i32 %r2
}

; CHECK-LABEL: @nvvm_shared_cluster_intrinsics
define void @nvvm_shared_cluster_intrinsics(ptr addrspace(3) %p0, i32 %offset) {
; CHECK: %r = call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p0, i32 %offset)
%r = call ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p0, i32 %offset)
ret void
}

; CHECK-LABEL: @nvvm_cp_async_bulk_intrinsics
define void @nvvm_cp_async_bulk_intrinsics(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, ptr addrspace(3) %src_shared, i32 %size) {
; CHECK: call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 false, i1 false)
; CHECK: call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr addrspace(3) %src_shared, i32 %size)
call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 false, i1 false)
call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src_shared, i32 %size)
ret void
}

; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_im2col
define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) {
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 false, i1 false)
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 false, i1 false)
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 false, i1 false)
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 0, i1 0)
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 0, i1 0)
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 0, i1 0)
ret void
}

; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_tile
define void @nvvm_cp_async_bulk_tensor_g2s_tile(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 false, i1 false)
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 false, i1 false)
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 false, i1 false)
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 0, i64 0, i1 false, i1 false)
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %5, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 0, i64 0, i1 false, i1 false)
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 0, i1 0)
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 0, i1 0)
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 0, i1 0)
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 0, i64 0, i1 0, i1 0)
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 0, i64 0, i1 0, i1 0)
ret void
}
