
AMDGPU: Implement tensor load and store instructions for gfx1250 #146636

Status: Open. This pull request wants to merge 3 commits into base branch main.
5 changes: 5 additions & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -640,6 +640,11 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_sr_f16_f32, "V2hV2hfUiIb", "nc", "f32-to-f16
// GFX1250+ only builtins.
//===----------------------------------------------------------------------===//

TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_load_to_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds, "vV4iV8iV4iV4iIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tensor_store_from_lds_d2, "vV4iV8iIi", "nc", "gfx1250-insts")

TARGET_BUILTIN(__builtin_amdgcn_global_load_tr4_b64_v2i32, "V2iV2i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr8_b64_v2i32, "V2iV2i*1", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr6_b96_v3i32, "V3iV3i*1", "nc", "transpose-load-f4f6-insts,wavefrontsize32")
@@ -0,0 +1,46 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250

typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v8i __attribute__((ext_vector_type(8)));

// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 0)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_tensor_load_to_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
{
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, 0);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_load_to_lds_d2(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.load.to.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 27)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_tensor_load_to_lds_d2(v4i sg0, v8i sg1)
{
__builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, 27);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], <4 x i32> [[SG2:%.*]], <4 x i32> [[SG3:%.*]], i32 22)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_tensor_store_from_lds(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
{
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, 22);
}

// CHECK-GFX1250-LABEL: @test_amdgcn_tensor_store_from_lds_d2(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: tail call void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> [[SG0:%.*]], <8 x i32> [[SG1:%.*]], i32 0)
// CHECK-GFX1250-NEXT: ret void
//
void test_amdgcn_tensor_store_from_lds_d2(v4i sg0, v8i sg1)
{
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, 0);
}
11 changes: 11 additions & 0 deletions clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl
@@ -1,6 +1,9 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx1250 -verify -S -o - %s

typedef int v4i __attribute__((ext_vector_type(4)));
typedef int v8i __attribute__((ext_vector_type(8)));

void test_setprio_inc_wg(short a) {
__builtin_amdgcn_s_setprio_inc_wg(a); // expected-error {{'__builtin_amdgcn_s_setprio_inc_wg' must be a constant integer}}
}
@@ -16,3 +19,11 @@ void test__builtin_amdgcn_cvt_f16_fp8(int a, int b) {
void test__builtin_amdgcn_cvt_f16_bf8(int a, int b) {
__builtin_amdgcn_cvt_f16_bf8(a, b); // expected-error {{'__builtin_amdgcn_cvt_f16_bf8' must be a constant integer}}
}

void test_amdgcn_tensor_load_store(v4i sg0, v8i sg1, v4i sg2, v4i sg3, int cpol)
{
__builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds' must be a constant integer}}
__builtin_amdgcn_tensor_load_to_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_load_to_lds_d2' must be a constant integer}}
__builtin_amdgcn_tensor_store_from_lds(sg0, sg1, sg2, sg3, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds' must be a constant integer}}
__builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, cpol); // expected-error {{'__builtin_amdgcn_tensor_store_from_lds_d2' must be a constant integer}}
}
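
For contrast with the diagnostics above, a call whose cache-policy argument is an integer constant compiles cleanly. A minimal sketch, illustrative only and not part of the patch, reusing the v4i/v8i typedefs declared in this test:

    // Sketch: literal cpol values satisfy the constant-integer requirement.
    void test_amdgcn_tensor_load_store_ok(v4i sg0, v8i sg1, v4i sg2, v4i sg3)
    {
      __builtin_amdgcn_tensor_load_to_lds(sg0, sg1, sg2, sg3, 0);  // constant cpol: accepted
      __builtin_amdgcn_tensor_store_from_lds_d2(sg0, sg1, 27);     // constant cpol: accepted
    }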
35 changes: 35 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3580,6 +3580,41 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic<
[IntrNoMem, IntrSpeculatable]
>;

class AMDGPUTensorLoadStore:
Intrinsic<
[],
[llvm_v4i32_ty, // D# group 0
llvm_v8i32_ty, // D# group 1
llvm_v4i32_ty, // D# group 2
llvm_v4i32_ty, // D# group 3
llvm_i32_ty], // cachepolicy:
// bits [0-2] = th
// bits [3-4] = scope
[IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<4>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]
>;

class AMDGPUTensorLoadStoreD2:
Intrinsic<
[],
[llvm_v4i32_ty, // D# group 0
llvm_v8i32_ty, // D# group 1
llvm_i32_ty], // cachepolicy:
// bits [0-2] = th
// bits [3-4] = scope
[IntrInaccessibleMemOrArgMemOnly, ImmArg<ArgIndex<2>>, IntrWillReturn, IntrConvergent, IntrNoCallback, IntrNoFree],
"", [SDNPMemOperand]
>;

def int_amdgcn_tensor_load_to_lds :
ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds">, AMDGPUTensorLoadStore;
def int_amdgcn_tensor_store_from_lds :
ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds">, AMDGPUTensorLoadStore;
def int_amdgcn_tensor_load_to_lds_d2 :
ClangBuiltin<"__builtin_amdgcn_tensor_load_to_lds_d2">, AMDGPUTensorLoadStoreD2;
def int_amdgcn_tensor_store_from_lds_d2 :
ClangBuiltin<"__builtin_amdgcn_tensor_store_from_lds_d2">, AMDGPUTensorLoadStoreD2;

/// Emit an addrspacecast without null pointer checking.
/// Should only be inserted by a pass based on analysis of an addrspacecast's src.
def int_amdgcn_addrspacecast_nonnull : DefaultAttrsIntrinsic<
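
As an aside on the cachepolicy operand defined above (bits [0-2] = th, bits [3-4] = scope), the immediates used in the codegen test decompose into those two fields. A minimal C++ sketch, illustrative only; the helper name is made up:

    // Packs a temporal hint and a scope into the tensor cachepolicy immediate,
    // following the bit layout documented in the intrinsic definitions above.
    constexpr unsigned makeTensorCPol(unsigned TH, unsigned Scope) {
      return (TH & 0x7) | ((Scope & 0x3) << 3);
    }
    // The values used in the codegen test decode as:
    //   27 -> th = 3, scope = 3;  22 -> th = 6, scope = 2.
    static_assert(makeTensorCPol(3, 3) == 27, "cpol used in the load_to_lds_d2 test");
    static_assert(makeTensorCPol(6, 2) == 22, "cpol used in the store_from_lds test");
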
30 changes: 30 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3348,6 +3348,20 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MI.eraseFromParent();
return;
}
case Intrinsic::amdgcn_tensor_load_to_lds:
case Intrinsic::amdgcn_tensor_store_from_lds: {
constrainOpWithReadfirstlane(B, MI, 1);
constrainOpWithReadfirstlane(B, MI, 2);
constrainOpWithReadfirstlane(B, MI, 3);
constrainOpWithReadfirstlane(B, MI, 4);
return;
}
case Intrinsic::amdgcn_tensor_load_to_lds_d2:
case Intrinsic::amdgcn_tensor_store_from_lds_d2: {
constrainOpWithReadfirstlane(B, MI, 1);
constrainOpWithReadfirstlane(B, MI, 2);
return;
}
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -5354,6 +5368,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case Intrinsic::amdgcn_pops_exiting_wave_id:
return getDefaultMappingSOP(MI);
case Intrinsic::amdgcn_tensor_load_to_lds_d2:
case Intrinsic::amdgcn_tensor_store_from_lds_d2:
case Intrinsic::amdgcn_tensor_load_to_lds:
case Intrinsic::amdgcn_tensor_store_from_lds: {
// Lie and claim everything is legal, even though all operands need to be
// SGPRs. applyMapping will have to deal with it using readfirstlane.
for (unsigned I = 1; I < MI.getNumOperands(); ++I) {
if (MI.getOperand(I).isReg()) {
Register Reg = MI.getOperand(I).getReg();
auto OpBank = getRegBankID(Reg, MRI);
unsigned Size = getSizeInBits(Reg, MRI, *TRI);
OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
}
}
Comment on lines +5377 to +5384

@tgymnich (Member) commented on Jul 2, 2025:

Suggested change (replace the loop above with):

    for (MachineOperand &MO : MI.all_uses()) {
      Register Reg = MO.getReg();
      auto OpBank = getRegBankID(Reg, MRI);
      unsigned Size = getSizeInBits(Reg, MRI, *TRI);
      OpdsMapping[I] = AMDGPU::getValueMapping(OpBank, Size);
    }

@changpeng (Contributor, Author) replied on Jul 2, 2025:

The suggested code does not compile. Thanks

@tgymnich (Member) replied:

sorry. Updated the code accordingly.

@changpeng (Contributor, Author) replied:

> sorry. Updated the code accordingly.

What is the value of "I" then? We need a way to map the operand back to the index.
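
One possible way to reconcile the suggestion with the need for an index, shown here only as a sketch and assuming MachineOperand::getOperandNo() is available in this tree, is to recover the index from the operand itself:

    // Sketch only, not the code in this patch: iterate register uses directly
    // and map each operand back to its index for OpdsMapping.
    for (MachineOperand &MO : MI.all_uses()) {
      Register Reg = MO.getReg();
      unsigned Idx = MO.getOperandNo();  // assumed accessor; maps MO to its index in MI
      unsigned Size = getSizeInBits(Reg, MRI, *TRI);
      OpdsMapping[Idx] = AMDGPU::getValueMapping(getRegBankID(Reg, MRI), Size);
    }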

break;
}
case Intrinsic::amdgcn_s_prefetch_data: {
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
Expand Down
27 changes: 21 additions & 6 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1784,6 +1784,7 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
bool validateMIMGAddrSize(const MCInst &Inst, const SMLoc &IDLoc);
bool validateMIMGD16(const MCInst &Inst);
bool validateMIMGDim(const MCInst &Inst, const OperandVector &Operands);
bool validateTensorR128(const MCInst &Inst);
bool validateMIMGMSAA(const MCInst &Inst);
bool validateOpSel(const MCInst &Inst);
bool validateTrue16OpSel(const MCInst &Inst);
@@ -4280,6 +4281,18 @@ bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
return true;
}

bool AMDGPUAsmParser::validateTensorR128(const MCInst &Inst) {
const unsigned Opc = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opc);

if ((Desc.TSFlags & SIInstrFlags::TENSOR_CNT) == 0)
return true;

int R128Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::r128);

return R128Idx < 0 || !Inst.getOperand(R128Idx).getImm();
}

static bool IsRevOpcode(const unsigned Opcode)
{
switch (Opcode) {
@@ -5113,14 +5126,11 @@ bool AMDGPUAsmParser::validateTHAndScopeBits(const MCInst &Inst,
return PrintError("scope and th combination is not valid");
}

bool IsStore = TID.mayStore();
bool IsAtomic =
TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet);

if (IsAtomic) {
unsigned THType = AMDGPU::getTemporalHintType(TID);
if (THType == AMDGPU::CPol::TH_TYPE_ATOMIC) {
if (!(CPol & AMDGPU::CPol::TH_TYPE_ATOMIC))
return PrintError("invalid th value for atomic instructions");
} else if (IsStore) {
} else if (THType == AMDGPU::CPol::TH_TYPE_STORE) {
if (!(CPol & AMDGPU::CPol::TH_TYPE_STORE))
return PrintError("invalid th value for store instructions");
} else {
@@ -5205,6 +5215,11 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
Error(IDLoc, "missing dim operand");
return false;
}
if (!validateTensorR128(Inst)) {
Error(getImmLoc(AMDGPUOperand::ImmTyD16, Operands),
"instruction must set modifier r128=0");
return false;
}
if (!validateMIMGMSAA(Inst)) {
Error(getImmLoc(AMDGPUOperand::ImmTyDim, Operands),
"invalid dim; must be MSAA type");
10 changes: 3 additions & 7 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -173,13 +173,12 @@ void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,

const unsigned Opcode = MI->getOpcode();
const MCInstrDesc &TID = MII.get(Opcode);
bool IsStore = TID.mayStore();
bool IsAtomic =
TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet);
unsigned THType = AMDGPU::getTemporalHintType(TID);
bool IsStore = (THType == AMDGPU::CPol::TH_TYPE_STORE);

O << " th:";

if (IsAtomic) {
if (THType == AMDGPU::CPol::TH_TYPE_ATOMIC) {
O << "TH_ATOMIC_";
if (TH & AMDGPU::CPol::TH_ATOMIC_CASCADE) {
if (Scope >= AMDGPU::CPol::SCOPE_DEV)
@@ -196,9 +195,6 @@
if (!IsStore && TH == AMDGPU::CPol::TH_RESERVED)
O << formatHex(TH);
else {
// This will default to printing load variants when neither MayStore nor
// MayLoad flag is present which is the case with instructions like
// image_get_resinfo.
O << (IsStore ? "TH_STORE_" : "TH_LOAD_");
switch (TH) {
case AMDGPU::CPol::TH_NT:
94 changes: 94 additions & 0 deletions llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -2019,3 +2019,97 @@ def : MIMGG16Mapping<IMAGE_SAMPLE_CD_O_nortn, IMAGE_SAMPLE_CD_O_G16_nortn>;
def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL_O_nortn, IMAGE_SAMPLE_CD_CL_O_G16_nortn>;
def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_O_nortn, IMAGE_SAMPLE_C_CD_O_G16_nortn>;
def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O_nortn, IMAGE_SAMPLE_C_CD_CL_O_G16_nortn>;

//===----------------------------------------------------------------------===//
// VIMAGE Tensor Instructions
//===----------------------------------------------------------------------===//

class VIMAGE_TENSOR_Pseudo<string opName, bit _UpTo2D = 0> :
InstSI<(outs ), (ins ), "", []>,
SIMCInstr<opName#!if(_UpTo2D, "_D2", ""), SIEncodingFamily.NONE> {

let isPseudo = 1;
let isCodeGenOnly = 1;
string Mnemonic = opName;

let VALU = 1;
let maybeAtomic = 0;
let TENSOR_CNT = 1;
let mayLoad = 1;
let mayStore = 1;
let Uses = [EXEC, TENSORcnt];
let Defs = [TENSORcnt];
let SchedRW = [WriteVMEM, WriteLDS];
let UseNamedOperandTable = 1;
let hasSideEffects = 0;

bit UpTo2D = _UpTo2D;
let InOperandList = !if(UpTo2D, (ins SReg_128:$vaddr0, SReg_256:$vaddr1, R128A16:$r128, CPol:$cpol),
(ins SReg_128:$vaddr0, SReg_256:$vaddr1, SReg_128:$vaddr2,
SReg_128:$vaddr3, R128A16:$r128, CPol:$cpol));
string AsmOperands = " $vaddr0, $vaddr1"#!if(UpTo2D, "", ", $vaddr2, $vaddr3")#"$r128$cpol";
}

let SubtargetPredicate = isGFX1250Plus in {
def TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds">;
def TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds">;
def TENSOR_LOAD_TO_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_load_to_lds", 1>;
def TENSOR_STORE_FROM_LDS_D2 : VIMAGE_TENSOR_Pseudo<"tensor_store_from_lds", 1>;
} // End SubtargetPredicate = isGFX1250Plus.

class TensorPat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat <
(node v4i32:$vaddr0, v8i32:$vaddr1, v4i32:$vaddr2, v4i32:$vaddr3, (i32 timm:$cpol)),
(inst $vaddr0, $vaddr1, $vaddr2, $vaddr3, 0, $cpol)
>;

class TensorD2Pat <VIMAGE_TENSOR_Pseudo inst, SDPatternOperator node> : GCNPat <
(node v4i32:$vaddr0, v8i32:$vaddr1, (i32 timm:$cpol)),
(inst $vaddr0, $vaddr1, 0, $cpol)
>;

let SubtargetPredicate = isGFX1250Plus in {
def : TensorPat <TENSOR_LOAD_TO_LDS, int_amdgcn_tensor_load_to_lds>;
def : TensorPat <TENSOR_STORE_FROM_LDS, int_amdgcn_tensor_store_from_lds>;
def : TensorD2Pat <TENSOR_LOAD_TO_LDS_D2, int_amdgcn_tensor_load_to_lds_d2>;
def : TensorD2Pat <TENSOR_STORE_FROM_LDS_D2, int_amdgcn_tensor_store_from_lds_d2>;
}

class VIMAGE_TENSOR_Real <bits<8> op, VIMAGE_TENSOR_Pseudo ps, string opName = ps.Mnemonic> :
InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands, []>,
VIMAGEe<op> {

// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let TSFlags = ps.TSFlags;
let mayLoad = ps.mayLoad;
let mayStore = ps.mayStore;
let UseNamedOperandTable = ps.UseNamedOperandTable;
let SchedRW = ps.SchedRW;

// D# group 2 and 3 set to NULL for 2D or less.
let vaddr2 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);
let vaddr3 = !if(ps.UpTo2D, !cast<int>(SGPR_NULL_gfx11plus.HWEncoding), ?);

// set to 0 based on SPG.
let vaddr4 = 0;
let rsrc = 0;
let vdata = 0;
let d16 = 0;
let a16 = 0;
let tfe = 0;
let dmask = 1; // sp3
let dim = 1; // sp3
}

multiclass VIMAGE_TENSOR_Real_gfx1250<bits<8> op> {
let AssemblerPredicate = isGFX1250Plus, DecoderNamespace = "GFX1250" in {
foreach DSuffix = ["_D2", ""] in {
defvar ps = !cast<VIMAGE_TENSOR_Pseudo>(NAME # DSuffix);
def DSuffix # _gfx1250 : VIMAGE_TENSOR_Real<op, ps, ps.Mnemonic>,
SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX1250>;
}
}
}

defm TENSOR_LOAD_TO_LDS : VIMAGE_TENSOR_Real_gfx1250<0xc4>;
defm TENSOR_STORE_FROM_LDS : VIMAGE_TENSOR_Real_gfx1250<0xc5>;
3 changes: 1 addition & 2 deletions llvm/lib/Target/AMDGPU/SIDefines.h
@@ -107,8 +107,7 @@ enum : uint64_t {
DisableWQM = UINT64_C(1) << 36,
Gather4 = UINT64_C(1) << 37,

// Reserved, must be 0.
Reserved0 = UINT64_C(1) << 38,
TENSOR_CNT = UINT64_C(1) << 38,

SCALAR_STORE = UINT64_C(1) << 39,
FIXED_SIZE = UINT64_C(1) << 40,
6 changes: 4 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -68,6 +68,9 @@ class InstSI <dag outs, dag ins, string asm = "",

field bit Gather4 = 0;

// wait count to manage tensor loads/stores.
field bit TENSOR_CNT = 0;

// This is an s_store_dword* instruction that requires a cache flush
// on wave termination. It is necessary to distinguish from mayStore
// SMEM instructions like the cache flush ones.
@@ -201,8 +204,7 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{36} = DisableWQM;
let TSFlags{37} = Gather4;

// Reserved, must be 0.
let TSFlags{38} = 0;
let TSFlags{38} = TENSOR_CNT;

let TSFlags{39} = ScalarStore;
let TSFlags{40} = FixedSize;
13 changes: 13 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7050,6 +7050,19 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
return nullptr;
}

// Legalize TENSOR_LOAD_TO_LDS, TENSOR_LOAD_TO_LDS_D2, TENSOR_STORE_FROM_LDS,
// TENSOR_STORE_FROM_LDS_D2. All their operands are scalar.
if (MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS ||
MI.getOpcode() == AMDGPU::TENSOR_LOAD_TO_LDS_D2 ||
MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS ||
MI.getOpcode() == AMDGPU::TENSOR_STORE_FROM_LDS_D2) {
for (MachineOperand &Src : MI.explicit_operands()) {
if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
}
return CreatedBB;
}

// Legalize MUBUF instructions.
bool isSoffsetLegal = true;
int SoffsetIdx =