[NVPTX] Add support for Shared Cluster Memory address space [1/2] #135444
Conversation
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-clang Author: None (modiking)
Changes
Adds support for new Distributed Shared Memory Address Space (DSMEM, addrspace 7). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#distributed-shared-memory for details.
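For illustration, a minimal hand-written IR sketch of how the updated intrinsic is expected to look after this change (the function, its arguments, and the store are invented for this example; only the intrinsic signature comes from the patch):

declare ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3), i32)

define void @example(ptr addrspace(3) %smem, i32 %cta_rank) {
  ; map a CTA-local shared pointer to the corresponding address in another CTA
  ; of the cluster; the result is now typed as shared::cluster (addrspace 7)
  %remote = call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %smem, i32 %cta_rank)
  ; assuming direct stores to the new space lower via the ISel changes below
  store i32 42, ptr addrspace(7) %remote, align 4
  ret void
}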
Patch is 79.43 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135444.diff
23 Files Affected:
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index 5931a77a85fec..08c8460045c6a 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -71,10 +71,11 @@ NVPTXTargetInfo::NVPTXTargetInfo(const llvm::Triple &Triple,
if (TargetPointerWidth == 32)
resetDataLayout(
- "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
+ "e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
else if (Opts.NVPTXUseShortPointers)
- resetDataLayout("e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-i64:64-i128:128-v16:"
- "16-v32:32-n16:32:64");
+ resetDataLayout(
+ "e-p3:32:32-p4:32:32-p5:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:"
+ "16-v32:32-n16:32:64");
else
resetDataLayout("e-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index fe29aadb1dd53..9cb00e8ee73d3 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -160,7 +160,7 @@
// RUN: %clang_cc1 -triple nvptx-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=NVPTX
-// NVPTX: target datalayout = "e-p:32:32-p6:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+// NVPTX: target datalayout = "e-p:32:32-p6:32:32-p7:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64"
// RUN: %clang_cc1 -triple nvptx64-unknown -o - -emit-llvm %s | \
// RUN: FileCheck %s -check-prefix=NVPTX64
diff --git a/clang/test/CodeGenCUDA/builtins-sm90.cu b/clang/test/CodeGenCUDA/builtins-sm90.cu
index a639c7716adb1..f4746df944536 100644
--- a/clang/test/CodeGenCUDA/builtins-sm90.cu
+++ b/clang/test/CodeGenCUDA/builtins-sm90.cu
@@ -50,7 +50,7 @@ __attribute__((global)) void kernel(long *out, void *ptr, unsigned u) {
auto * sptr = (__attribute__((address_space(3))) void *)ptr;
// CHECK: call ptr @llvm.nvvm.mapa(ptr %{{.*}}, i32 %{{.*}})
out[i++] = (long) __nvvm_mapa(ptr, u);
- // CHECK: call ptr addrspace(3) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
+ // CHECK: call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %{{.*}}, i32 %{{.*}})
out[i++] = (long) __nvvm_mapa_shared_cluster(sptr, u);
// CHECK: call i32 @llvm.nvvm.getctarank(ptr {{.*}})
out[i++] = __nvvm_getctarank(ptr);
diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 621879fc5648b..2ce9a4540034c 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -499,7 +499,7 @@ Syntax:
.. code-block:: llvm
- declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+ declare void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(1) %src, i32 %size, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
Overview:
"""""""""
@@ -563,7 +563,7 @@ Syntax:
.. code-block:: llvm
- declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
+ declare void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr addrspace(3) %src, i32 %size)
Overview:
"""""""""
@@ -718,7 +718,7 @@ Syntax:
.. code-block:: llvm
- declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
+ declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch)
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...)
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 4aeb1d8a2779e..f053fa6e2bf22 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -131,6 +131,7 @@ def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
+def llvm_dshared_ptr_ty : LLVMQualPointerType<7>; // (dshared)ptr
//
// MISC
@@ -691,15 +692,15 @@ class CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, string mode> {
list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
list<LLVMType> ArgsTy = !listconcat(
- [llvm_shared_ptr_ty, // dst_smem_ptr
- llvm_shared_ptr_ty, // mbarrier_smem_ptr
- llvm_ptr_ty], // tensormap_ptr
- TensorDimsTy, // actual tensor dims
- Im2ColOffsetsTy, // im2col offsets
- [llvm_i16_ty, // cta_mask
- llvm_i64_ty, // cache_hint
- llvm_i1_ty, // Flag for cta_mask
- llvm_i1_ty] // Flag for cache_hint
+ [llvm_dshared_ptr_ty, // dst_smem_ptr
+ llvm_shared_ptr_ty, // mbarrier_smem_ptr
+ llvm_ptr_ty], // tensormap_ptr
+ TensorDimsTy, // actual tensor dims
+ Im2ColOffsetsTy, // im2col offsets
+ [llvm_i16_ty, // cta_mask
+ llvm_i64_ty, // cache_hint
+ llvm_i1_ty, // Flag for cta_mask
+ llvm_i1_ty] // Flag for cache_hint
);
int TempFlagsStartIdx = !add(dim, 5);
@@ -5087,7 +5088,7 @@ def int_nvvm_mapa
[IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
"llvm.nvvm.mapa">;
def int_nvvm_mapa_shared_cluster
- : DefaultAttrsIntrinsic<[llvm_shared_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
+ : DefaultAttrsIntrinsic<[llvm_dshared_ptr_ty], [llvm_shared_ptr_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable, NoCapture<ArgIndex<0>>],
"llvm.nvvm.mapa.shared.cluster">;
def int_nvvm_getctarank
@@ -5187,14 +5188,14 @@ def int_nvvm_discard_L2 : DefaultAttrsIntrinsic<[],
// From Global to Shared Cluster
def int_nvvm_cp_async_bulk_global_to_shared_cluster
: DefaultAttrsIntrinsic<[],
- [llvm_shared_ptr_ty, // dst_smem_ptr
- llvm_shared_ptr_ty, // mbarrier_ptr
- llvm_global_ptr_ty, // src_gmem_ptr
- llvm_i32_ty, // copy_size
- llvm_i16_ty, // cta_mask
- llvm_i64_ty, // cache_hint
- llvm_i1_ty, // Flag for cta_mask
- llvm_i1_ty], // Flag for cache_hint
+ [llvm_dshared_ptr_ty, // dst_dsmem_ptr
+ llvm_shared_ptr_ty, // mbarrier_ptr
+ llvm_global_ptr_ty, // src_gmem_ptr
+ llvm_i32_ty, // copy_size
+ llvm_i16_ty, // cta_mask
+ llvm_i64_ty, // cache_hint
+ llvm_i1_ty, // Flag for cta_mask
+ llvm_i1_ty], // Flag for cache_hint
[IntrConvergent, IntrArgMemOnly,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
@@ -5204,10 +5205,10 @@ def int_nvvm_cp_async_bulk_global_to_shared_cluster
// From Shared CTA to Shared Cluster
def int_nvvm_cp_async_bulk_shared_cta_to_cluster
: DefaultAttrsIntrinsic<[],
- [llvm_shared_ptr_ty, // dst_smem_ptr
- llvm_shared_ptr_ty, // mbarrier_ptr
- llvm_shared_ptr_ty, // src_smem_ptr
- llvm_i32_ty], // copy_size
+ [llvm_dshared_ptr_ty, // dst_dsmem_ptr
+ llvm_shared_ptr_ty, // mbarrier_ptr
+ llvm_shared_ptr_ty, // src_smem_ptr
+ llvm_i32_ty], // copy_size
[IntrConvergent, IntrArgMemOnly,
WriteOnly<ArgIndex<0>>, ReadOnly<ArgIndex<2>>,
NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
diff --git a/llvm/include/llvm/Support/NVPTXAddrSpace.h b/llvm/include/llvm/Support/NVPTXAddrSpace.h
index 486a396621da1..a3eac31f2e5e9 100644
--- a/llvm/include/llvm/Support/NVPTXAddrSpace.h
+++ b/llvm/include/llvm/Support/NVPTXAddrSpace.h
@@ -25,6 +25,7 @@ enum AddressSpace : unsigned {
ADDRESS_SPACE_CONST = 4,
ADDRESS_SPACE_LOCAL = 5,
ADDRESS_SPACE_TENSOR = 6,
+ ADDRESS_SPACE_DSHARED = 7,
ADDRESS_SPACE_PARAM = 101,
};
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 0b329d91c3c7c..7482014d3c168 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -46,6 +46,7 @@
#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/NVPTXAddrSpace.h"
#include "llvm/Support/Regex.h"
#include "llvm/TargetParser/Triple.h"
#include <cstdint>
@@ -938,6 +939,47 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
return false; // No other 'arm.*', 'aarch64.*'.
}
+static Intrinsic::ID shouldUpgradeNVPTXDSharedIntrinsic(Function *F,
+ StringRef Name) {
+ if (Name.consume_front("mapa.shared.cluster"))
+ if (F->getReturnType()->getPointerAddressSpace() ==
+ NVPTXAS::ADDRESS_SPACE_SHARED)
+ return Intrinsic::nvvm_mapa_shared_cluster;
+
+ if (Name.consume_front("cp.async.bulk.")) {
+ Intrinsic::ID ID =
+ StringSwitch<Intrinsic::ID>(Name)
+ .Case("global.to.shared.cluster",
+ Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster)
+ .Case("shared.cta.to.cluster",
+ Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster)
+ .Case("tensor.g2s.im2col.3d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d)
+ .Case("tensor.g2s.im2col.4d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d)
+ .Case("tensor.g2s.im2col.5d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d)
+ .Case("tensor.g2s.tile.1d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d)
+ .Case("tensor.g2s.tile.2d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d)
+ .Case("tensor.g2s.tile.3d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d)
+ .Case("tensor.g2s.tile.4d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d)
+ .Case("tensor.g2s.tile.5d",
+ Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d)
+ .Default(Intrinsic::not_intrinsic);
+
+ if (ID != Intrinsic::not_intrinsic)
+ if (F->getArg(0)->getType()->getPointerAddressSpace() ==
+ NVPTXAS::ADDRESS_SPACE_SHARED)
+ return ID;
+ }
+
+ return Intrinsic::not_intrinsic;
+}
+
static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) {
if (Name.consume_front("abs."))
return StringSwitch<Intrinsic::ID>(Name)
@@ -1284,6 +1326,14 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
}
}
+ // Upgrade Distributed Shared Memory Intrinsics
+ Intrinsic::ID IID = shouldUpgradeNVPTXDSharedIntrinsic(F, Name);
+ if (IID != Intrinsic::not_intrinsic) {
+ rename(F);
+ NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID);
+ return true;
+ }
+
// The following nvvm intrinsics correspond exactly to an LLVM idiom, but
// not to an intrinsic alone. We expand them in UpgradeIntrinsicCall.
//
@@ -4704,6 +4754,43 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
CI->eraseFromParent();
return;
}
+ case Intrinsic::nvvm_mapa_shared_cluster: {
+ // Create a new call with the correct address space.
+ NewCall =
+ Builder.CreateCall(NewFn, {CI->getArgOperand(0), CI->getArgOperand(1)});
+ Value *Res = NewCall;
+ Res = Builder.CreateAddrSpaceCast(
+ Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC));
+ Res = Builder.CreateAddrSpaceCast(
+ Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED));
+ NewCall->takeName(CI);
+ CI->replaceAllUsesWith(Res);
+ CI->eraseFromParent();
+ return;
+ }
+ case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster:
+ case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
+ case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: {
+
+ SmallVector<Value *, 4> Args(CI->args());
+ Args[0] = Builder.CreateAddrSpaceCast(
+ Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC));
+ Args[0] = Builder.CreateAddrSpaceCast(
+ Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_DSHARED));
+
+ NewCall = Builder.CreateCall(NewFn, Args);
+ NewCall->takeName(CI);
+ CI->replaceAllUsesWith(NewCall);
+ CI->eraseFromParent();
+ return;
+ }
case Intrinsic::riscv_sha256sig0:
case Intrinsic::riscv_sha256sig1:
case Intrinsic::riscv_sha256sum0:
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
index e42e738b9973f..9ab59c1c144f3 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp
@@ -288,6 +288,7 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum,
case NVPTX::AddressSpace::Global:
case NVPTX::AddressSpace::Const:
case NVPTX::AddressSpace::Shared:
+ case NVPTX::AddressSpace::Dshared:
case NVPTX::AddressSpace::Param:
case NVPTX::AddressSpace::Local:
O << "." << A;
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 98e77ca80b8d5..c20c522f36bd3 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -176,6 +176,7 @@ enum AddressSpace : AddressSpaceUnderlyingType {
Shared = 3,
Const = 4,
Local = 5,
+ Dshared = 7,
// NVPTX Backend Private:
Param = 101
diff --git a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
index b910ccab21bf3..60bc22f5f589c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAliasAnalysis.cpp
@@ -86,6 +86,11 @@ static AliasResult::Kind getAliasResult(unsigned AS1, unsigned AS2) {
// TODO: cvta.param is not yet supported. We need to change aliasing
// rules once it is added.
+ // Distributed shared memory aliases with shared memory.
+ if (((AS1 == ADDRESS_SPACE_SHARED) && (AS2 == ADDRESS_SPACE_DSHARED)) ||
+ ((AS1 == ADDRESS_SPACE_DSHARED) && (AS2 == ADDRESS_SPACE_SHARED)))
+ return AliasResult::MayAlias;
+
return (AS1 == AS2 ? AliasResult::MayAlias : AliasResult::NoAlias);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index ec1f969494cd1..34ddfd3c411a8 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -513,6 +513,8 @@ static std::optional<unsigned> convertAS(unsigned AS) {
return NVPTX::AddressSpace::Global;
case llvm::ADDRESS_SPACE_SHARED:
return NVPTX::AddressSpace::Shared;
+ case llvm::ADDRESS_SPACE_DSHARED:
+ return NVPTX::AddressSpace::Dshared;
case llvm::ADDRESS_SPACE_GENERIC:
return NVPTX::AddressSpace::Generic;
case llvm::ADDRESS_SPACE_PARAM:
@@ -658,7 +660,8 @@ getOperationOrderings(MemSDNode *N, const NVPTXSubtarget *Subtarget) {
bool AddrGenericOrGlobalOrShared =
(CodeAddrSpace == NVPTX::AddressSpace::Generic ||
CodeAddrSpace == NVPTX::AddressSpace::Global ||
- CodeAddrSpace == NVPTX::AddressSpace::Shared);
+ CodeAddrSpace == NVPTX::AddressSpace::Shared ||
+ CodeAddrSpace == NVPTX::AddressSpace::Dshared);
if (!AddrGenericOrGlobalOrShared)
return NVPTX::Ordering::NotAtomic;
@@ -979,6 +982,9 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
case ADDRESS_SPACE_SHARED:
Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
break;
+ case ADDRESS_SPACE_DSHARED:
+ Opc = TM.is64Bit() ? NVPTX::cvta_dshared_64 : NVPTX::cvta_dshared;
+ break;
case ADDRESS_SPACE_CONST:
Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
break;
@@ -1001,6 +1007,9 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
case ADDRESS_SPACE_SHARED:
Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
break;
+ case ADDRESS_SPACE_DSHARED:
+ Opc = TM.is64Bit() ? NVPTX::cvta_to_dshared_64 : NVPTX::cvta_to_dshared;
+ break;
case ADDRESS_SPACE_CONST:
Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;
break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index 16b489afddf5c..4cf5292983048 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -137,6 +137,7 @@ def hasAtomBitwise64 : Predicate<"Subtarget->hasAtomBitwise64()">;
def hasAtomMinMax64 : Predicate<"Subtarget->hasAtomMinMax64()">;
def hasVote : Predicate<"Subtarget->hasVote()">;
def hasDouble : Predicate<"Subtarget->hasDouble()">;
+def hasClusters : Predicate<"Subtarget->hasClusters()">;
def hasLDG : Predicate<"Subtarget->hasLDG()">;
def hasLDU : Predicate<"Subtarget->hasLDU()">;
def hasPTXASUnreachableBug : Predicate<"Subtarget->hasPTXASUnreachableBug()">;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 8528ff702f236..19b370e4ce6f9 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -33,6 +33,9 @@ def AS_match {
code shared = [{
return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_SHARED);
}];
+ code dshared = [{
+ return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_DSHARED);
+ }];
code global = [{
return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
}];
@@ -1979,10 +1982,11 @@ class ATOMIC_GLOBAL_CHK <dag frag>
: PatFrag<!setdagop(frag, ops), frag, AS_match.global>;
class ATOMIC_SHARED_CHK <dag frag>
: PatFrag<!setdagop(frag, ops), frag, AS_match.shared>;
+class ATOMIC_DSHARED_CHK <dag frag>
+ : PatFrag<!setdagop(frag, ops), frag, AS_match.dshared>;
class ATOMIC_GENERIC_CHK <dag frag>
: PatFrag<!setdagop(frag, ops), frag, AS_match.generic>;
-
multiclass F_ATOMIC_2<RegTyInfo t, string sem_str, string as_str, string op_str,
SDPatternOperator op, list<Predicate> preds> {
defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b;";
@@ -2034,6 +2038,7 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
defvar frag_pat = (frag node:$a, node:$b);
defm _G : F_ATOMIC_2<t, "", ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
defm _S : F_ATOMIC_2<t, "", ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
+ defm _DS : F_ATOMIC_2<t, "", ".shared::cluster", op_str, ATOMIC_DSHARED_CHK<frag_pat>, !listconcat([hasSM<80>], preds)>;
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
}
@@ -2041,6 +2046,7 @@ multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, st
defvar frag_pat = (frag node:$a, node:$b, node:$c);
defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_...
[truncated]
I wish PTX would be a bit more consistent about naming things. Documentation calls it distributed shared memory (and it is distributed, and is shared), but the PTX instructions, compiler builtins and intrinsics use shared::cluster (as opposed to regular shared AKA shared::cta). I would prefer to keep nomenclature used in LLVM consistent everywhere and stick with SHARED_CLUSTER, given that it is already in use and happens to be technically more precise ("Distributed" can mean anything, while cluster is unambiguous in specifying the scope of sharing domain) |
That sounds good. I'll rename with ADDRESS_SHARED_CLUSTER/llvm_shared_cluster_ptr_ty
Backend changes look reasonable so far. One concern I have with this change is that until now we've assumed specific address-spaces are non-overlapping. You've addressed some of the places where this assumption is encoded but I think there are others you have not. For example, addrspacecasts between shared::cta and shared::cluster seem like they should be valid and should be expanded via a pair of cvta instructions through generic space, I think this would produce an error right now. I also wonder if InferAS or other places have made this assumption as well.
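For example, a hand-written snippet like the following (not from the patch; names are made up) is the kind of cast in question, and would presumably need to be expanded through generic space, i.e. roughly a cvta.shared followed by a cvta.to.shared::cluster:

define ptr addrspace(7) @cta_to_cluster(ptr addrspace(3) %p) {
  ; direct cast from shared::cta (addrspace 3) to shared::cluster (addrspace 7);
  ; if this is valid IR, the backend would be expected to lower it via generic
  %q = addrspacecast ptr addrspace(3) %p to ptr addrspace(7)
  ret ptr addrspace(7) %q
}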
Opc = TM.is64Bit() ? NVPTX::cvta_shared_cluster_64
                   : NVPTX::cvta_shared_cluster;
My understanding is that cluster is not supported until sm_90, and that sm_90+ does not support 32-bit compilation. Is there something I'm missing? If not, we should never select the 32-bit version here and instead check to ensure we're compiling for sm_90+.
okay, I believe we still support 32-bit for shared-memory pointers, even in sm_90+?
Yeah, 32-bit shared memory pointers are still supported as an option even in 64-bit compilation.
The check here is for TM.is64Bit(), not whether shared memory is 64 or 32 bits. What matters for cvta is the size of the generic pointers, which I think is always 64, right? If that is the case, we'll never want the 32-bit variant. Even if there is something I'm missing and there are cases where we would want the 32-bit variant, we should still be checking PTX and SM version here and throwing an error as needed.
What matters for cvta is the size of the generic pointers, which I think is always 64, right?
It will depend on the compilation mode. You are correct that we cannot produce valid code for shared_cluster conversions in 32-bit mode, so the question is -- how are we going to fail?
If we unconditionally use NVPTX::cvta_shared_cluster_64, we'll likely break other things, because it will hardcode the assumption that the generic pointer is 64-bit while the rest of the DAG and the lowering machinery would be expecting it to be 32 bits. I suspect we'll run into an assertion somewhere. The bottom line is that we may still want to go through the motions, and pretend that we can convert AS to/from cluster_shared in 32-bit mode, so we can finish the compilation, and then let ptxas report an error. At least at that point the user would be able to examine the PTX and reason about what went wrong.
The check here is for TM.is64Bit() not whether shared memory is 64 or 32 bits.
Ah I see. Agreed then that 32-bit cvta.*shared::cluster shouldn't exist as a valid construct during ISel. I think throwing the error in ISel is a good place to fail. ptxas will complain if we give it a 32-bit target machine on sm_90+:
ptxas fatal : 32-Bit ABI (--machine 32 or 32-Bit addressing) is not supported on sm_90 or higher architectures.
But given that a 32-bit version doesn't exist we shouldn't generate one.
We should also update the documentation to note that cvta.*shared::cluster is 64-bit only as well:
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta
I think throwing the error in ISel is a good place to fail.
It's a convenient place for an LLVM developer. It's problematic for the end user. In general, if we're diagnosing a user error, it must be done in a way that's actionable for the user, not for the compiler developer. Diagnosing by compiler crash is not the best UI.
Granted, producing an invalid instruction and relying on ptxas to diagnose it is only marginally better.
I think the missing bit of the puzzle here is that we still support 32-bit compilation on sm_90+. If we make it impossible to do it in principle, then it makes the whole point moot, and we no longer have to bother with 32-bit generic pointers for the instructions that are available on newer GPUs only.
This is something to be addressed separately. For this patch, I'm fine with proceeding with the assumption that 32-bit compilation never happens.
I think the missing bit of the puzzle here is that we still support 32-bit compilation on sm_90+
Yeah, that's true; having that be enforced early would resolve this problem. I like that as an overall solution, since in reality only 64-bit mode is available on sm_90+. Also agreed that it can be addressed separately.
If a program/user requests 32-bit ABI and sm_90, that's an error. I'd expect this to be reported either by the frontend (e.g. clang) or when parsing the llvm module target data. At that point, NVPTX can just assume that this never happens and we can hard fail here to be helpful to other compiler developers, since they are the only ones that should be seeing this failure.
A general thought,
(Sorry I clicked the wrong button)
In the auto-upgrade code it's doing exactly that, where we cast to generic as an intermediary between shared::cta/shared::cluster. It does indeed error out during instruction selection if we do it directly. Given the conversion in IR is valid, I agree it makes sense to directly support it and then expand it out to the double cast. I'll add that and a test to validate.
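Concretely, the upgrade path in the AutoUpgrade.cpp hunk above produces IR along these lines for the mapa case (hand-written here for illustration; the function and value names are made up):

declare ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3), i32)

define ptr addrspace(3) @upgraded_use(ptr addrspace(3) %p, i32 %rank) {
  ; old bitcode called a version of the intrinsic returning ptr addrspace(3);
  ; after auto-upgrade, the new intrinsic is called and its shared::cluster
  ; result is cast back through generic so existing addrspace(3) users still
  ; type-check
  %r7 = call ptr addrspace(7) @llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p, i32 %rank)
  %gen = addrspacecast ptr addrspace(7) %r7 to ptr
  %r3 = addrspacecast ptr %gen to ptr addrspace(3)
  ret ptr addrspace(3) %r3
}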
That makes sense, will do once we're happy with all the changes.
Made the requested changes, @AlexMaclean / @Artem-B PTAL when you get the chance -- thanks!
✅ With the latest revision this PR passed the undef deprecator. |
return std::make_pair(II->getArgOperand(0), llvm::ADDRESS_SPACE_SHARED);
case Intrinsic::nvvm_isspacep_shared_cluster:
Technically this is conservative, since nvvm_isspacep_shared_cluster == true implies that the pointer is also under ADDRESS_SPACE_SHARED. However, the current interface only allows returning one address space, so that'll need to get updated. I'm planning on improving InferAddrSpaces to also detect from mapa/mapa_shared_cluster, so I'll update this part as well during that.
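For context, if I'm reading the InferAddressSpaces hook correctly, the guarded pattern it is consulted for looks roughly like this (illustrative only; the function name and the load are invented):

declare i1 @llvm.nvvm.isspacep.shared.cluster(ptr)
declare void @llvm.assume(i1)

define i32 @guarded_load(ptr %p) {
  ; when the predicate holds, %p is known to point into the cluster's shared
  ; window; the hook above can currently report only one address space for it
  %is = call i1 @llvm.nvvm.isspacep.shared.cluster(ptr %p)
  call void @llvm.assume(i1 %is)
  %v = load i32, ptr %p, align 4
  ret i32 %v
}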
llvm changes LGTM, though I'm not too familiar with the MLIR portion of this change.
MLIR side LGTM
Thanks for the thorough review! I'm splitting this so that this change only contains the addrspace + new instructions/intrinsics, and the follow-up PR #136768 updates the existing intrinsics.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/180/builds/16715
Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/155/builds/8601
Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/160/builds/16726
Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/54/builds/8491
Here is the relevant piece of the build log for reference:
…36768)
Adds support for new Shared Cluster Memory Address Space (SHARED_CLUSTER, addrspace 7). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#distributed-shared-memory for details.
Follow-up to #135444
1. Update existing codegen/intrinsics in LLVM and MLIR that now use this address space
2. Auto-upgrade previous intrinsics that used SMEM (addrspace 3) but were really taking in a shared cluster pointer to the new address space
…e [2/2] (#136768)
Adds support for new Shared Cluster Memory Address Space (SHARED_CLUSTER, addrspace 7). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#distributed-shared-memory for details.
Follow-up to llvm/llvm-project#135444
1. Update existing codegen/intrinsics in LLVM and MLIR that now use this address space
2. Auto-upgrade previous intrinsics that used SMEM (addrspace 3) but were really taking in a shared cluster pointer to the new address space
…vm#135444)
Adds support for new Shared Cluster Memory Address Space (SHARED_CLUSTER, addrspace 7). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#distributed-shared-memory for details.
1. Update address space structures and datalayout to contain the new space
2. Add new intrinsics that use this new address space
3. Update NVPTX alias analysis
The existing intrinsics are updated in llvm#136768
ptxas needs to be updated to the correct arch to match what llc is outputting
…vm#136768)
Adds support for new Shared Cluster Memory Address Space (SHARED_CLUSTER, addrspace 7). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#distributed-shared-memory for details.
Follow-up to llvm#135444
1. Update existing codegen/intrinsics in LLVM and MLIR that now use this address space
2. Auto-upgrade previous intrinsics that used SMEM (addrspace 3) but were really taking in a shared cluster pointer to the new address space
Adds support for new Shared Cluster Memory Address Space (SHARED_CLUSTER, addrspace 7). See https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#distributed-shared-memory for details.
The existing intrinsics are updated in #136768