AMDGPU: Implement tensor load and store instructions for gfx1250 #146636
Conversation
Co-Authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
@llvm/pr-subscribers-clang @llvm/pr-subscribers-mc

Author: Changpeng Fang (changpeng)

Changes

Patch is 42.91 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146636.diff

20 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 5133947c498ca..fb358297a5eed 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -640,6 +640,11 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
// GFX1250+ only builtins.
//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
+TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
+
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr4_b64_v2i32, "V2iV2i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr8_b64_v2i32, "V2iV2i*1", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr6_b96_v3i32, "V3iV3i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index f09b3b92c4ea0..1fc2d57d4941c 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -621,6 +621,32 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {LoadTy});
return Builder.CreateCall(F, {Addr});
}
+ case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds:
+ case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds_d2:
+ case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds:
+ case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds_d2: {
+ Intrinsic::ID IID;
+ switch (BuiltinID) {
+ case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds:
+ IID = Intrinsic::amdgcn_tensor_load_to_lds;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_tensor_load_to_lds_d2:
+ IID = Intrinsic::amdgcn_tensor_load_to_lds_d2;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds:
+ IID = Intrinsic::amdgcn_tensor_store_from_lds;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_tensor_store_from_lds_d2:
+ IID = Intrinsic::amdgcn_tensor_store_from_lds_d2;
+ break;
+ }
+
+ SmallVector<Value *, 5> Args;
+ for (int i = 0, e = E->getNumArgs(); i != e; ++i)
+ Args.push_back(EmitScalarExpr(E->getArg(i)));
+ llvm::Function *F = CGM.getIntrinsic(IID, {});
+ return Builder.CreateCall(F, {Args});
+ }
case AMDGPU::BI__builtin_amdgcn_load_to_lds: {
// Should this have asan instrumentation?
return emitBuiltinWithOneOverloadedType<5>(*this, E,
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
new file mode 100644
index 0000000000000..49ffbf4517160
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-tensor-load-store.cl
@@ -0,0 +1,46 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
+
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_load_to_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
+{
+ __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, 0);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d2(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 27)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_load_to_lds_d2(v4i sg0, v8i sg1)
+{
+ __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, 27);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 22)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_store_from_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
+{
+ __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, 22);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d2(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 0)
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_tensor_store_from_lds_d2(v4i sg0, v8i sg1)
+{
+ __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, 0);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
index 69857087bae08..3ba0d50e79031 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -1,6 +1,9 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s
+typedef int v4i __attribute__((ext_vector_type(4)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+
void test_setprio_inc_wg(short a) {
__builtin_amdgcn_s_setprio_inc_wg(a); // expected-error {{'__builtin_amdgcn_s_setprio_inc_wg' must be a constant integer}}
}
@@ -16,3 +19,11 @@ void test__builtin_amdgcn_cvt_f16_fp8(int a, int b) {
void test__builtin_amdgcn_cvt_f16_bf8(int a, int b) {
__builtin_amdgcn_cvt_f16_bf8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_bf8' must be a constant integer}}
}
+
+void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
+{
+ __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
+ __builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds_d2' must be a constant integer}}
+ __builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
+ __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a0a81568424f5..2aabf6109022f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3580,6 +3580,37 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
[IntrNoMem, IntrSpeculatable]
>;
+class AMDGPUTensorLoadStore:
+ Intrinsic<
+ [],
+ [llvm_v4i32_ty, // D# group 0
+ llvm_v8i32_ty, // D# group 1
+ llvm_v4i32_ty, // D# group 2
+ llvm_v4i32_ty, // D# group 3
+ llvm_i32_ty], // cachepolicy:
+ // bits [0-2] = th
+ // bits [3-4] = scope
+ [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]
+ >;
+
+class AMDGPUTensorLoadStoreD2:
+ Intrinsic<
+ [],
+ [llvm_v4i32_ty, // D# group 0
+ llvm_v8i32_ty, // D# group 1
+ llvm_i32_ty], // cachepolicy:
+ // bits [0-2] = th
+ // bits [3-4] = scope
+ [IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<2>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
+ "", [SDNPMemOperand]
+ >;
+
+def int_amdgcn_tensor_load_to_lds : AMDGPUTensorLoadStore;
+def int_amdgcn_tensor_store_from_lds : AMDGPUTensorLoadStore;
+def int_amdgcn_tensor_load_to_lds_d2 : AMDGPUTensorLoadStoreD2;
+def int_amdgcn_tensor_store_from_lds_d2 : AMDGPUTensorLoadStoreD2;
+
/// Emit an addrspacecast without null pointer checking.
/// Should only be inserted by a pass based on analysis of an addrspacecast's src.
def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 6f6d7b8d99af5..353fb23fa1520 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3348,6 +3348,20 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MI.eraseFromParent();
return;
}
+ case Intrinsic::amdgcn_tensor_load_to_lds:
+ case Intrinsic::amdgcn_tensor_store_from_lds: {
+ constrainOpWithReadfirstlane(B, MI, 1);
+ constrainOpWithReadfirstlane(B, MI, 2);
+ constrainOpWithReadfirstlane(B, MI, 3);
+ constrainOpWithReadfirstlane(B, MI, 4);
+ return;
+ }
+ case Intrinsic::amdgcn_tensor_load_to_lds_d2:
+ case Intrinsic::amdgcn_tensor_store_from_lds_d2: {
+ constrainOpWithReadfirstlane(B, MI, 1);
+ constrainOpWithReadfirstlane(B, MI, 2);
+ return;
+ }
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -5354,6 +5368,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case Intrinsic::amdgcn_pops_exiting_wave_id:
return getDefaultMappingSOP(MI);
+ case Intrinsic::amdgcn_tensor_load_to_lds_d2:
+ case Intrinsic::amdgcn_tensor_store_from_lds_d2:
+ case Intrinsic::amdgcn_tensor_load_to_lds:
+ case Intrinsic::amdgcn_tensor_store_from_lds: {
+ // Lie and claim everything is legal, even all operands need to be
+ // SGPRs. applyMapping will have to deal with it with readfirstlane.
+ for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
+ if (MI.getOperand(I).isReg()) {
+ Register Reg = MI.getOperand(I).getReg();
+ auto OpBank = getRegBankID(Reg, MRI);
+ unsigned Size = getSizeInBits(Reg, MRI, *TRI);
+ OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
+ }
+ }
+ break;
+ }
case Intrinsic::amdgcn_s_prefetch_data: {
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 14fec71996a0e..c429e95f52a9d 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1784,6 +1784,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
bool validateMIMGAddrSize(const MCInst &Inst, const SMLoc &IDLoc);
bool validateMIMGD16(const MCInst &Inst);
bool validateMIMGDim(const MCInst &Inst, const OperandVector &Operands);
+ bool validateTensorR128(const MCInst &Inst);
bool validateMIMGMSAA(const MCInst &Inst);
bool validateOpSel(const MCInst &Inst);
bool validateTrue16OpSel(const MCInst &Inst);
@@ -4280,6 +4281,20 @@ bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
return true;
}
+bool AMDGPUAsmParser::validateTensorR128(const MCInst &Inst) {
+ const unsigned Opc = Inst.getOpcode();
+ const MCInstrDesc &Desc = MII.get(Opc);
+
+ if ((Desc.TSFlags & SIInstrFlags::TENSOR_CNT) == 0)
+ return true;
+
+ int R128Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128);
+ if (R128Idx >= 0 && Inst.getOperand(R128Idx).getImm())
+ return false;
+
+ return true;
+}
+
static bool IsRevOpcode(const unsigned Opcode)
{
switch (Opcode) {
@@ -5113,14 +5128,11 @@ bool AMDGPUAsmParser::validateTHAndScopeBits(const MCInst &Inst,
return PrintError("scope and th combination is not valid");
}
- bool IsStore = TID.mayStore();
- bool IsAtomic =
- TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet);
-
- if (IsAtomic) {
+ unsigned THType = AMDGPU::getTemporalHintType(TID);
+ if (THType == AMDGPU::CPol::TH_TYPE_ATOMIC) {
if (!(CPol & AMDGPU::CPol::TH_TYPE_ATOMIC))
return PrintError("invalid th value for atomic instructions");
- } else if (IsStore) {
+ } else if (THType == AMDGPU::CPol::TH_TYPE_STORE) {
if (!(CPol & AMDGPU::CPol::TH_TYPE_STORE))
return PrintError("invalid th value for store instructions");
} else {
@@ -5205,6 +5217,11 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
Error(IDLoc, "missing dim operand");
return false;
}
+ if (!validateTensorR128(Inst)) {
+ Error(getImmLoc(AMDGPUOperand::ImmTyD16, Operands),
+ "instruction must set modifier r128=0");
+ return false;
+ }
if (!validateMIMGMSAA(Inst)) {
Error(getImmLoc(AMDGPUOperand::ImmTyDim, Operands),
"invalid dim; must be MSAA type");
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index a6ce42dca92be..fa1474d153834 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -173,13 +173,12 @@ void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
const unsigned Opcode = MI->getOpcode();
const MCInstrDesc &TID = MII.get(Opcode);
- bool IsStore = TID.mayStore();
- bool IsAtomic =
- TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet);
+ unsigned THType = AMDGPU::getTemporalHintType(TID);
+ bool IsStore = (THType == AMDGPU::CPol::TH_TYPE_STORE);
O << " th:";
- if (IsAtomic) {
+ if (THType == AMDGPU::CPol::TH_TYPE_ATOMIC) {
O << "TH_ATOMIC_";
if (TH & AMDGPU::CPol::TH_ATOMIC_CASCADE) {
if (Scope >= AMDGPU::CPol::SCOPE_DEV)
@@ -196,9 +195,6 @@ void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
if (!IsStore && TH == AMDGPU::CPol::TH_RESERVED)
O << formatHex(TH);
else {
- // This will default to printing load variants when neither MayStore nor
- // MayLoad flag is present which is the case with instructions like
- // image_get_resinfo.
O << (IsStore ? "TH_STORE_" : "TH_LOAD_");
switch (TH) {
case AMDGPU::CPol::TH_NT:
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 8d94d73bc1aab..531fae3ceff59 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -2019,3 +2019,97 @@ def : MIMGG16Mapping<IMAGE_SAMPLE_CD_O_nortn, IMAGE_SAMPLE_CD_O_G16_nortn>;
def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL_O_nortn, IMAGE_SAMPLE_CD_CL_O_G16_nortn>;
def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_O_nortn, IMAGE_SAMPLE_C_CD_O_G16_nortn>;
def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O_nortn, IMAGE_SAMPLE_C_CD_CL_O_G16_nortn>;
+
+//===----------------------------------------------------------------------===//
+// VIMAGE Tensor Instructions
+//===----------------------------------------------------------------------===//
+
+class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> :
+ InstSI<(outs ), (ins ), "", []>,
+ SIMCInstr<opName#!if(_UpTo2D, "_D2", ""), SIEncodingFamily.NONE> {
+
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ string Mnemonic = opName;
+
+ let VALU = 1;
+ let maybeAtomic = 0;
+ let TENSOR_CNT = 1;
+ let mayLoad = 1;
+ let mayStore = 1;
+ let Uses = [EXEC, TENSORcnt];
+ let Defs = [TENSORcnt];
+ let SchedRW = [WriteVMEM, WriteLDS];
+ let UseNamedOperandTable = 1;
+ let hasSideEffects = 0;
+
+ bit UpTo2D = _UpTo2D;
+ let InOperandList = !if(UpTo2D, (ins SReg_128:$vaddr0, SReg_256:$vaddr1, R128A16:$r128, CPol:$cpol),
+ (ins SReg_128:$vaddr0, SReg_256:$vaddr1, SReg_128:$vaddr2,
+ SReg_128:$vaddr3, R128A16:$r128, CPol:$cpol));
+ string AsmOperands = " $vaddr0, $vaddr1"#!if(UpTo2D, "", ", $vaddr2, $vaddr3")#"$r128$cpol";
+}
+
+let SubtargetPredicate = isGFX1250Plus in {
+def TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">;
+def TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">;
+def TENSOR_LOAD_TO_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>;
+def TENSOR_STORE_FROM_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>;
+} // End SubtargetPredicate = isGFX1250Plus.
+
+class TensorPat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node v4i32:$vaddr0, v8i32:$vaddr1, v4i32:$vaddr2, v4i32:$vaddr3, (i32 timm:$cpol)),
+ (inst $vaddr0, $vaddr1, $vaddr2, $vaddr3, 0, $cpol)
+>;
+
+class TensorD2Pat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat <
+ (node v4i32:$vaddr0, v8i32:$vaddr1, (i32 timm:$cpol)),
+ (inst $vaddr0, $vaddr1, 0, $cpol)
+>;
+
+let SubtargetPredicate = isGFX1250Plus in {
+def : TensorPat <TENSOR_LOAD_TO_LDS, int_amdgcn_tensor_load_to_lds>;
+def : TensorPat <TENSOR_STORE_FROM_LDS, int_amdgcn_tensor_store_from_lds>;
+def : TensorD2Pat <TENSOR_LOAD_TO_LDS_D2, int_amdgcn_tensor_load_to_lds_d2>;
+def : TensorD2Pat <TENSOR_STORE_FROM_LDS_D2, int_amdgcn_tensor_store_from_lds_d2>;
+}
+
+class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = ps.Mnemonic> :
+ InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands, []>,
+ VIMAGEe<op> {
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let TSFlags = ps.TSFlags;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let SchedRW = ps.SchedRW;
+
+ // D# group 2 and 3 set to NULL for 2D or less.
+ let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
+ let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
+
+ // set to 0 based on SPG.
+ let vaddr4 = 0;
+ let rsrc = 0;
+ let vdata = 0;
+ let d16 = 0;
+ let a16 = 0;
+ let tfe = 0;
+ let dmask = 1; // sp3
+ let dim = 1; // sp3
+}
+
+multiclass VIMAGE_TENSOR_Real_gfx1250<bits<8> op> {
+ let AssemblerPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250" in {
+ foreach DSuffix = ["_D2", ""] in {
+ defvar ps = !cast<VIMAGE_TENSOR_Pseudo>(NAME # DSuffix);
+ def DSuffix # _gfx1250 : VIMAGE_TENSOR_Real<op, ps, ps.Mnemonic>,
+ SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX1250>;
+ }
+ }
+}
+
+defm TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Real_gfx1250<0xc4>;
+defm TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Real_gfx1250<0xc5>;
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 4b72f66abbd76..76e29e4393206 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -107,8 +107,7 @@ enum : uint64_t {
DisableWQM = UINT64_C(1) << 36,
Gather4 = UINT64_C(1) << 37,
- // Reserved, must be 0.
- Reserved0 = UINT64_C(1) << 38,
+ TENSOR_CNT = UINT64_C(1) << 38,
SCALAR_STORE = UINT64_C(1) << 39,
FIXED_SIZE = UINT64_C(1) << 40,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 42aae35112cac..c27d4e0df6fc5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -68,6 +68,9 @@ class InstSI <dag outs, dag ins, string asm = "",
field bit Gather4 = 0;
+ // wait count to manage tensor loads/stores.
+ field bit TENSOR_CNT = 0;
+
// This is an s_store_dword* instruction that requires a cache flush
// on wave termination. It is necessary to distinguish from mayStore
// SMEM instructions like the cache flush ones.
@@ -201,8 +204,7 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{36} = DisableWQM;
let TSFlags{37} = Gather4;
- // Reserved, must be 0.
- let TSFlags{38} = 0;
+ let TSFlags{38} = TENSOR_CNT;
let TSFlags{39} = ScalarStore;
let TSFlags{40} = FixedSize;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/li...
[truncated]
@llvm/pr-subscribers-backend-amdgpu
✅ With the latest revision this PR passed the C/C++ code formatter.
Fix clang-format error
for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
  if (MI.getOperand(I).isReg()) {
    Register Reg = MI.getOperand(I).getReg();
    auto OpBank = getRegBankID(Reg, MRI);
    unsigned Size = getSizeInBits(Reg, MRI, *TRI);
    OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
  }
}
Suggested change:

for (MachineOperand &MO : MI.all_uses()) {
  Register Reg = MO.getReg();
  auto OpBank = getRegBankID(Reg, MRI);
  unsigned Size = getSizeInBits(Reg, MRI, *TRI);
  OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
}
The suggested code does not compile. Thanks
sorry. Updated the code accordingly.
What is the value of "I" then? We need a way to map the operand back to the index.
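One way to recover the index while iterating only the register uses, as a minimal sketch (assuming MachineOperand::getOperandNo() is available in this tree; the other names match the patch):

for (MachineOperand &MO : MI.all_uses()) {
  // Map the operand back to its index in MI so OpdsMapping is filled at the right slot.
  unsigned I = MO.getOperandNo();
  Register Reg = MO.getReg();
  unsigned OpBank = getRegBankID(Reg, MRI);
  unsigned Size = getSizeInBits(Reg, MRI, *TRI);
  OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
}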
A few updates based on review comments: 1. Fix the clang-format error. 2. Use ClangBuiltin in the intrinsic definitions to avoid the boilerplate in TargetBuiltins/AMDGPU.cpp. 3. Fold the check into a return of a boolean expression.
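A minimal sketch of what item 3 could look like, assuming it refers to validateTensorR128 from the patch (not necessarily the exact committed form):

bool AMDGPUAsmParser::validateTensorR128(const MCInst &Inst) {
  const unsigned Opc = Inst.getOpcode();
  const MCInstrDesc &Desc = MII.get(Opc);

  // Only tensor instructions (TENSOR_CNT set) carry this restriction.
  if ((Desc.TSFlags & SIInstrFlags::TENSOR_CNT) == 0)
    return true;

  // Valid only when the r128 modifier is absent or cleared.
  int R128Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128);
  return R128Idx < 0 || !Inst.getOperand(R128Idx).getImm();
}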
Minor format change.
No description provided.