Skip to content

Commit 4b0c690

Browse files
committed
review feedback: split and add tests, remove 32-bit cvta.*shared::cluster, allow isel to lower shared::cta <-> shared::cluster addrspacecasts
1 parent 250db65 commit 4b0c690

File tree

8 files changed

+398
-235
lines changed

8 files changed

+398
-235
lines changed

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4759,8 +4759,6 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
47594759
NewCall =
47604760
Builder.CreateCall(NewFn, {CI->getArgOperand(0), CI->getArgOperand(1)});
47614761
Value *Res = NewCall;
4762-
Res = Builder.CreateAddrSpaceCast(
4763-
Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC));
47644762
Res = Builder.CreateAddrSpaceCast(
47654763
Res, Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED));
47664764
NewCall->takeName(CI);
@@ -4778,10 +4776,8 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
47784776
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
47794777
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
47804778
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: {
4781-
4779+
// Create a new call with the correct address space.
47824780
SmallVector<Value *, 4> Args(CI->args());
4783-
Args[0] = Builder.CreateAddrSpaceCast(
4784-
Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_GENERIC));
47854781
Args[0] = Builder.CreateAddrSpaceCast(
47864782
Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER));
47874783

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -983,8 +983,10 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
983983
Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared;
984984
break;
985985
case ADDRESS_SPACE_SHARED_CLUSTER:
986-
Opc = TM.is64Bit() ? NVPTX::cvta_shared_cluster_64
987-
: NVPTX::cvta_shared_cluster;
986+
if (!TM.is64Bit())
987+
report_fatal_error(
988+
"Shared cluster address space is only supported in 64-bit mode");
989+
Opc = NVPTX::cvta_shared_cluster_64;
988990
break;
989991
case ADDRESS_SPACE_CONST:
990992
Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const;
@@ -1009,8 +1011,10 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
10091011
Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared;
10101012
break;
10111013
case ADDRESS_SPACE_SHARED_CLUSTER:
1012-
Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_cluster_64
1013-
: NVPTX::cvta_to_shared_cluster;
1014+
if (!TM.is64Bit())
1015+
report_fatal_error(
1016+
"Shared cluster address space is only supported in 64-bit mode");
1017+
Opc = NVPTX::cvta_to_shared_cluster_64;
10141018
break;
10151019
case ADDRESS_SPACE_CONST:
10161020
Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const;

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2993,8 +2993,42 @@ SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
29932993
unsigned SrcAS = N->getSrcAddressSpace();
29942994
unsigned DestAS = N->getDestAddressSpace();
29952995
if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
2996-
DestAS != llvm::ADDRESS_SPACE_GENERIC)
2996+
DestAS != llvm::ADDRESS_SPACE_GENERIC) {
2997+
// Shared and SharedCluster can be converted to each other through generic
2998+
// space
2999+
if (SrcAS == llvm::ADDRESS_SPACE_SHARED &&
3000+
DestAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER) {
3001+
const MVT GenerictVT =
3002+
getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_GENERIC);
3003+
const MVT SharedClusterVT =
3004+
getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_SHARED_CLUSTER);
3005+
SDValue GenericConversion =
3006+
DAG.getAddrSpaceCast(SDLoc(), GenerictVT, Op.getOperand(0),
3007+
ADDRESS_SPACE_SHARED, ADDRESS_SPACE_GENERIC);
3008+
SDValue SharedClusterConversion = DAG.getAddrSpaceCast(
3009+
SDLoc(), SharedClusterVT, GenericConversion, ADDRESS_SPACE_GENERIC,
3010+
ADDRESS_SPACE_SHARED_CLUSTER);
3011+
return SharedClusterConversion;
3012+
}
3013+
3014+
if (SrcAS == llvm::ADDRESS_SPACE_SHARED_CLUSTER &&
3015+
DestAS == llvm::ADDRESS_SPACE_SHARED) {
3016+
const MVT GenerictVT =
3017+
getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_GENERIC);
3018+
const MVT SharedVT =
3019+
getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_SHARED);
3020+
SDValue GenericConversion = DAG.getAddrSpaceCast(
3021+
SDLoc(), GenerictVT, Op.getOperand(0), ADDRESS_SPACE_SHARED_CLUSTER,
3022+
ADDRESS_SPACE_GENERIC);
3023+
SDValue SharedConversion =
3024+
DAG.getAddrSpaceCast(SDLoc(), SharedVT, GenericConversion,
3025+
ADDRESS_SPACE_GENERIC, ADDRESS_SPACE_SHARED);
3026+
return SharedConversion;
3027+
}
3028+
29973029
return DAG.getUNDEF(Op.getValueType());
3030+
}
3031+
29983032
return Op;
29993033
}
30003034

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2038,15 +2038,15 @@ multiclass F_ATOMIC_2_AS<RegTyInfo t, SDPatternOperator frag, string op_str, lis
20382038
defvar frag_pat = (frag node:$a, node:$b);
20392039
defm _G : F_ATOMIC_2<t, "", ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
20402040
defm _S : F_ATOMIC_2<t, "", ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
2041-
defm _S_C : F_ATOMIC_2<t, "", ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasSM<80>], preds)>;
2041+
defm _S_C : F_ATOMIC_2<t, "", ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
20422042
defm _GEN : F_ATOMIC_2<t, "", "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
20432043
}
20442044

20452045
multiclass F_ATOMIC_3_AS<RegTyInfo t, SDPatternOperator frag, string sem_str, string op_str, list<Predicate> preds = []> {
20462046
defvar frag_pat = (frag node:$a, node:$b, node:$c);
20472047
defm _G : F_ATOMIC_3<t, sem_str, ".global", op_str, ATOMIC_GLOBAL_CHK<frag_pat>, preds>;
20482048
defm _S : F_ATOMIC_3<t, sem_str, ".shared", op_str, ATOMIC_SHARED_CHK<frag_pat>, preds>;
2049-
defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasSM<80>], preds)>;
2049+
defm _S_C : F_ATOMIC_3<t, sem_str, ".shared::cluster", op_str, ATOMIC_SHARED_CLUSTER_CHK<frag_pat>, !listconcat([hasClusters], preds)>;
20502050
defm _GEN : F_ATOMIC_3<t, sem_str, "", op_str, ATOMIC_GENERIC_CHK<frag_pat>, preds>;
20512051
}
20522052

@@ -2327,29 +2327,39 @@ def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>;
23272327
def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>;
23282328

23292329

2330-
multiclass NG_TO_G<string Str> {
2330+
multiclass NG_TO_G<string Str, list<Predicate> Preds = []> {
23312331
def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
2332-
"cvta." # Str # ".u32 \t$result, $src;", []>;
2332+
"cvta." # Str # ".u32 \t$result, $src;", []>, Requires<Preds>;
23332333
def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
2334-
"cvta." # Str # ".u64 \t$result, $src;", []>;
2334+
"cvta." # Str # ".u64 \t$result, $src;", []>, Requires<Preds>;
23352335
}
23362336

2337-
multiclass G_TO_NG<string Str> {
2337+
multiclass NG_TO_G_64<string Str, list<Predicate> Preds = []> {
2338+
def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
2339+
"cvta." # Str # ".u64 \t$result, $src;", []>, Requires<Preds>;
2340+
}
2341+
2342+
multiclass G_TO_NG<string Str, list<Predicate> Preds = []> {
23382343
def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
2339-
"cvta.to." # Str # ".u32 \t$result, $src;", []>;
2344+
"cvta.to." # Str # ".u32 \t$result, $src;", []>, Requires<Preds>;
2345+
def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
2346+
"cvta.to." # Str # ".u64 \t$result, $src;", []>, Requires<Preds>;
2347+
}
2348+
2349+
multiclass G_TO_NG_64<string Str, list<Predicate> Preds = []> {
23402350
def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
2341-
"cvta.to." # Str # ".u64 \t$result, $src;", []>;
2351+
"cvta.to." # Str # ".u64 \t$result, $src;", []>, Requires<Preds>;
23422352
}
23432353

23442354
defm cvta_local : NG_TO_G<"local">;
23452355
defm cvta_shared : NG_TO_G<"shared">;
2346-
defm cvta_shared_cluster : NG_TO_G<"shared::cluster">;
2356+
defm cvta_shared_cluster : NG_TO_G_64<"shared::cluster", [hasClusters]>;
23472357
defm cvta_global : NG_TO_G<"global">;
23482358
defm cvta_const : NG_TO_G<"const">;
23492359

23502360
defm cvta_to_local : G_TO_NG<"local">;
23512361
defm cvta_to_shared : G_TO_NG<"shared">;
2352-
defm cvta_to_shared_cluster : G_TO_NG<"shared::cluster">;
2362+
defm cvta_to_shared_cluster : G_TO_NG_64<"shared::cluster", [hasClusters]>;
23532363
defm cvta_to_global : G_TO_NG<"global">;
23542364
defm cvta_to_const : G_TO_NG<"const">;
23552365

llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -263,18 +263,18 @@ define void @nvvm_shared_cluster_intrinsics(ptr addrspace(3) %p0, i32 %offset) {
263263

264264
; CHECK-LABEL: @nvvm_cp_async_bulk_intrinsics
265265
define void @nvvm_cp_async_bulk_intrinsics(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, ptr addrspace(3) %src_shared, i32 %size) {
266-
; CHECK: call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 false, i1 false)
267-
; CHECK: call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr addrspace(3) %src_shared, i32 %size)
266+
; CHECK: call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 false, i1 false)
267+
; CHECK: call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr addrspace(3) %src_shared, i32 %size)
268268
call void @llvm.nvvm.cp.async.bulk.global.to.shared.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(1) %src, i32 %size, i16 0, i64 0, i1 false, i1 false)
269269
call void @llvm.nvvm.cp.async.bulk.shared.cta.to.cluster(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr addrspace(3) %src_shared, i32 %size)
270270
ret void
271271
}
272272

273273
; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_im2col
274274
define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) {
275-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 undef, i64 undef, i1 false, i1 false)
276-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 undef, i64 undef, i1 false, i1 false)
277-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %6, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 undef, i64 undef, i1 false, i1 false)
275+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 undef, i64 undef, i1 false, i1 false)
276+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 undef, i64 undef, i1 false, i1 false)
277+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 undef, i64 undef, i1 false, i1 false)
278278
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 undef, i64 undef, i1 0, i1 0)
279279
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 undef, i64 undef, i1 0, i1 0)
280280
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 undef, i64 undef, i1 0, i1 0)
@@ -283,11 +283,11 @@ define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrs
283283

284284
; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_tile
285285
define void @nvvm_cp_async_bulk_tensor_g2s_tile(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) {
286-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 undef, i64 undef, i1 false, i1 false)
287-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 undef, i64 undef, i1 false, i1 false)
288-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %6, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 undef, i64 undef, i1 false, i1 false)
289-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %8, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 undef, i64 undef, i1 false, i1 false)
290-
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %10, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 undef, i64 undef, i1 false, i1 false)
286+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 undef, i64 undef, i1 false, i1 false)
287+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 undef, i64 undef, i1 false, i1 false)
288+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 undef, i64 undef, i1 false, i1 false)
289+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 undef, i64 undef, i1 false, i1 false)
290+
; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %5, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 undef, i64 undef, i1 false, i1 false)
291291
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 undef, i64 undef, i1 0, i1 0)
292292
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 undef, i64 undef, i1 0, i1 0)
293293
call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 undef, i64 undef, i1 0, i1 0)
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_80 | FileCheck %s -check-prefixes=ALL,NOPTRCONV,CLS64
2+
; RUN: llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_80 --nvptx-short-ptr | FileCheck %s -check-prefixes=ALL,PTRCONV,CLS64
3+
; RUN: %if ptxas-12.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_80 | %ptxas-verify %}
4+
; RUN: %if ptxas-12.8 %{ llc -O0 < %s -mtriple=nvptx64 -mcpu=sm_80 --nvptx-short-ptr | %ptxas-verify %}
5+
6+
; ALL-LABEL: conv_shared_cluster_to_generic
7+
define i32 @conv_shared_cluster_to_generic(ptr addrspace(7) %ptr) {
8+
; CLS32: cvta.shared::cluster.u32
9+
; PTRCONV: cvt.u64.u32
10+
; NOPTRCONV-NOT: cvt.u64.u32
11+
; CLS64: cvta.shared::cluster.u64
12+
; ALL: ld.u32
13+
%genptr = addrspacecast ptr addrspace(7) %ptr to ptr
14+
%val = load i32, ptr %genptr
15+
ret i32 %val
16+
}
17+
18+
; ALL-LABEL: conv_generic_to_shared_cluster
19+
define i32 @conv_generic_to_shared_cluster(ptr %ptr) {
20+
; CLS32: cvta.to.shared::cluster.u32
21+
; CLS64: cvta.to.shared::cluster.u64
22+
; PTRCONV: cvt.u32.u64
23+
; NOPTRCONV-NOT: cvt.u32.u64
24+
; ALL: ld.shared::cluster.u32
25+
%specptr = addrspacecast ptr %ptr to ptr addrspace(7)
26+
%val = load i32, ptr addrspace(7) %specptr
27+
ret i32 %val
28+
}
29+
30+
; ALL-LABEL: conv_shared_to_shared_cluster
31+
define i32 @conv_shared_to_shared_cluster(ptr addrspace(3) %ptr) {
32+
; CLS64: cvta.shared.u64
33+
; CLS64: cvta.to.shared::cluster.u64
34+
; ALL: ld.shared::cluster.u32
35+
%specptr = addrspacecast ptr addrspace(3) %ptr to ptr addrspace(7)
36+
%val = load i32, ptr addrspace(7) %specptr
37+
ret i32 %val
38+
}
39+
40+
; ALL-LABEL: conv_shared_cluster_to_shared
41+
define i32 @conv_shared_cluster_to_shared(ptr addrspace(7) %ptr) {
42+
; CLS64: cvta.shared::cluster.u64
43+
; CLS64: cvta.to.shared.u64
44+
; ALL: ld.shared.u32
45+
%specptr = addrspacecast ptr addrspace(7) %ptr to ptr addrspace(3)
46+
%val = load i32, ptr addrspace(3) %specptr
47+
ret i32 %val
48+
}

0 commit comments

Comments
 (0)