Skip to content

Commit dee3190

Browse files
committed
[AMDGPU] Add llvm.amdgcn.global.load.lds intrinsic
Differential Revision: https://reviews.llvm.org/D125279
1 parent 6aabf60 commit dee3190

File tree

7 files changed

+447
-18
lines changed

7 files changed

+447
-18
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1811,6 +1811,25 @@ def int_amdgcn_perm :
18111811
Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
18121812
[IntrNoMem, IntrSpeculatable, IntrWillReturn]>;
18131813

1814+
//===----------------------------------------------------------------------===//
1815+
// GFX9 Intrinsics
1816+
//===----------------------------------------------------------------------===//
1817+
1818+
// Direct global -> LDS DMA load. Produces no IR result; the loaded value is
// written into LDS at the given base + imm offset.
class AMDGPUGlobalLoadLDS : Intrinsic <
  [],
  [LLVMQualPointerType<llvm_i8_ty, 1>, // Base global pointer to load from
   LLVMQualPointerType<llvm_i8_ty, 3>, // LDS base pointer to store to
   llvm_i32_ty,                        // Data byte size: 1/2/4
   llvm_i32_ty,                        // imm offset (applied to both global and LDS address)
   llvm_i32_ty],                       // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0,
                                       //                                   bit 1 = slc/sc1,
                                       //                                   bit 2 = dlc on gfx10+,
                                       //                                   bit 4 = scc/nt on gfx90a+))
  // Size, offset and aux data must all be immediates; the two pointers do not
  // escape.  (The original listed ImmArg<ArgIndex<2>> twice — each of args
  // 2/3/4 needs exactly one ImmArg.)
  [IntrWillReturn, NoCapture<ArgIndex<0>>, NoCapture<ArgIndex<1>>,
   ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>],
  "", [SDNPMemOperand]>;
def int_amdgcn_global_load_lds : AMDGPUGlobalLoadLDS;
1832+
18141833
//===----------------------------------------------------------------------===//
18151834
// GFX10 Intrinsics
18161835
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 102 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1783,6 +1783,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
17831783
case Intrinsic::amdgcn_raw_buffer_load_lds:
17841784
case Intrinsic::amdgcn_struct_buffer_load_lds:
17851785
return selectBufferLoadLds(I);
1786+
case Intrinsic::amdgcn_global_load_lds:
1787+
return selectGlobalLoadLds(I);
17861788
default: {
17871789
return selectImpl(I, *CoverageInfo);
17881790
}
@@ -3149,6 +3151,106 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
31493151
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
31503152
}
31513153

3154+
/// Match a zero extend from a 32-bit value to 64-bits.
3155+
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3156+
Register ZExtSrc;
3157+
if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3158+
return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3159+
3160+
// Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3161+
const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3162+
if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3163+
return false;
3164+
3165+
if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3166+
return Def->getOperand(1).getReg();
3167+
}
3168+
3169+
return Register();
3170+
}
3171+
3172+
// Select the llvm.amdgcn.global.load.lds intrinsic into a GLOBAL_LOAD_LDS_*
// machine instruction (global -> LDS DMA).  Operands of MI:
//   1 = global base pointer, 2 = LDS base pointer (copied to M0),
//   3 = byte size imm (1/2/4), 4 = imm offset, 5 = cachepolicy imm.
bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{
  unsigned Opc;
  unsigned Size = MI.getOperand(3).getImm();

  // Only 1/2/4-byte transfers have a matching opcode; bail otherwise.
  switch (Size) {
  default:
    return false;
  case 1:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
    break;
  case 2:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
    break;
  case 4:
    Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
    break;
  }

  MachineBasicBlock *MBB = MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  // The LDS destination base is communicated through M0.
  BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
    .add(MI.getOperand(2));

  Register Addr = MI.getOperand(1).getReg();
  Register VOffset;
  // Try to split SAddr and VOffset. Global and LDS pointers share the same
  // immediate offset, so we cannot use a regular SelectGlobalSAddr().
  if (!isSGPR(Addr)) {
    auto AddrDef = getDefSrcRegIgnoringCopies(Addr, *MRI);
    if (isSGPR(AddrDef->Reg)) {
      Addr = AddrDef->Reg;
    } else if (AddrDef->MI->getOpcode() == AMDGPU::G_PTR_ADD) {
      // Pattern: (G_PTR_ADD sgpr_base, (zext vgpr_off)) -> SADDR + VOffset.
      Register SAddr =
          getSrcRegIgnoringCopies(AddrDef->MI->getOperand(1).getReg(), *MRI);
      if (SAddr && isSGPR(SAddr)) {
        Register PtrBaseOffset = AddrDef->MI->getOperand(2).getReg();
        if (Register Off = matchZeroExtendFromS32(*MRI, PtrBaseOffset)) {
          Addr = SAddr;
          VOffset = Off;
        }
      }
    }
  }

  if (isSGPR(Addr)) {
    // Use the SADDR form; it requires a VGPR offset operand, so materialize
    // a zero VOffset when the split above did not produce one.
    Opc = AMDGPU::getGlobalSaddrOp(Opc);
    if (!VOffset) {
      VOffset = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
      BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_MOV_B32_e32), VOffset)
        .addImm(0);
    }
  }

  auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc))
    .addReg(Addr);

  if (isSGPR(Addr))
    MIB.addReg(VOffset);

  MIB.add(MI.getOperand(4))  // offset
     .add(MI.getOperand(5)); // cpol

  // Rebuild the memory operands: one MMO for the global load side and one for
  // the LDS store side, derived from the intrinsic's single MMO.
  MachineMemOperand *LoadMMO = *MI.memoperands_begin();
  MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
  LoadPtrI.Offset = MI.getOperand(4).getImm();
  MachinePointerInfo StorePtrI = LoadPtrI;
  LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
  StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
  // Strip load/store flags before re-adding the correct one to each MMO.
  auto F = LoadMMO->getFlags() &
           ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
  LoadMMO = MF->getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
                                     Size, LoadMMO->getBaseAlign());
  // NOTE(review): the store MMO is always recorded as 4 bytes / align 4,
  // even for the 1- and 2-byte variants — presumably a conservative
  // over-approximation of the LDS write; confirm against hardware docs.
  MachineMemOperand *StoreMMO =
      MF->getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
                               sizeof(int32_t), Align(4));

  MIB.setMemRefs({LoadMMO, StoreMMO});

  MI.eraseFromParent();
  return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
3253+
31523254
bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
31533255
MI.setDesc(TII.get(MI.getOperand(1).getImm()));
31543256
MI.removeOperand(1);
@@ -3687,24 +3789,6 @@ AMDGPUInstructionSelector::selectScratchOffset(MachineOperand &Root) const {
36873789
}};
36883790
}
36893791

3690-
/// Match a zero extend from a 32-bit value to 64-bits.
3691-
static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) {
3692-
Register ZExtSrc;
3693-
if (mi_match(Reg, MRI, m_GZExt(m_Reg(ZExtSrc))))
3694-
return MRI.getType(ZExtSrc) == LLT::scalar(32) ? ZExtSrc : Register();
3695-
3696-
// Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 0)
3697-
const MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
3698-
if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES)
3699-
return false;
3700-
3701-
if (mi_match(Def->getOperand(2).getReg(), MRI, m_ZeroInt())) {
3702-
return Def->getOperand(1).getReg();
3703-
}
3704-
3705-
return Register();
3706-
}
3707-
37083792
// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
37093793
InstructionSelector::ComplexRendererFns
37103794
AMDGPUInstructionSelector::selectGlobalSAddr(MachineOperand &Root) const {

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
144144
bool selectGlobalAtomicFadd(MachineInstr &I, MachineOperand &AddrOp,
145145
MachineOperand &DataOp) const;
146146
bool selectBufferLoadLds(MachineInstr &MI) const;
147+
bool selectGlobalLoadLds(MachineInstr &MI) const;
147148
bool selectBVHIntrinsic(MachineInstr &I) const;
148149
bool selectSMFMACIntrin(MachineInstr &I) const;
149150
bool selectWaveAddress(MachineInstr &I) const;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3026,6 +3026,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
30263026
constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
30273027
return;
30283028
}
3029+
case Intrinsic::amdgcn_global_load_lds: {
3030+
applyDefaultMapping(OpdMapper);
3031+
constrainOpWithReadfirstlane(MI, MRI, 2);
3032+
return;
3033+
}
30293034
default: {
30303035
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
30313036
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -4517,6 +4522,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
45174522
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
45184523
break;
45194524
}
4525+
case Intrinsic::amdgcn_global_load_lds: {
4526+
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
4527+
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
4528+
break;
4529+
}
45204530
default:
45214531
return getInvalidInstructionMapping();
45224532
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1318,6 +1318,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
13181318
Info.flags |= MachineMemOperand::MOStore;
13191319
return true;
13201320
}
1321+
case Intrinsic::amdgcn_global_load_lds: {
1322+
Info.opc = ISD::INTRINSIC_VOID;
1323+
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1324+
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1325+
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
1326+
MachineMemOperand::MOVolatile;
1327+
return true;
1328+
}
13211329
default:
13221330
return false;
13231331
}
@@ -8318,6 +8326,81 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
83188326

83198327
return SDValue(Load, 0);
83208328
}
8329+
case Intrinsic::amdgcn_global_load_lds: {
8330+
unsigned Opc;
8331+
unsigned Size = Op->getConstantOperandVal(4);
8332+
switch (Size) {
8333+
default:
8334+
return SDValue();
8335+
case 1:
8336+
Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
8337+
break;
8338+
case 2:
8339+
Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
8340+
break;
8341+
case 4:
8342+
Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
8343+
break;
8344+
}
8345+
8346+
auto *M = cast<MemSDNode>(Op);
8347+
SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
8348+
8349+
SmallVector<SDValue, 6> Ops;
8350+
8351+
SDValue Addr = Op.getOperand(2); // Global ptr
8352+
SDValue VOffset;
8353+
// Try to split SAddr and VOffset. Global and LDS pointers share the same
8354+
// immediate offset, so we cannot use a regular SelectGlobalSAddr().
8355+
if (Addr->isDivergent() && Addr.getOpcode() == ISD::ADD) {
8356+
SDValue LHS = Addr.getOperand(0);
8357+
SDValue RHS = Addr.getOperand(1);
8358+
8359+
if (LHS->isDivergent())
8360+
std::swap(LHS, RHS);
8361+
8362+
if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
8363+
RHS.getOperand(0).getValueType() == MVT::i32) {
8364+
// add (i64 sgpr), (zero_extend (i32 vgpr))
8365+
Addr = LHS;
8366+
VOffset = RHS.getOperand(0);
8367+
}
8368+
}
8369+
8370+
Ops.push_back(Addr);
8371+
if (!Addr->isDivergent()) {
8372+
Opc = AMDGPU::getGlobalSaddrOp(Opc);
8373+
if (!VOffset)
8374+
VOffset = SDValue(
8375+
DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
8376+
DAG.getTargetConstant(0, DL, MVT::i32)), 0);
8377+
Ops.push_back(VOffset);
8378+
}
8379+
8380+
Ops.push_back(Op.getOperand(5)); // Offset
8381+
Ops.push_back(Op.getOperand(6)); // CPol
8382+
Ops.push_back(M0Val.getValue(0)); // Chain
8383+
Ops.push_back(M0Val.getValue(1)); // Glue
8384+
8385+
MachineMemOperand *LoadMMO = M->getMemOperand();
8386+
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
8387+
LoadPtrI.Offset = Op->getConstantOperandVal(5);
8388+
MachinePointerInfo StorePtrI = LoadPtrI;
8389+
LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
8390+
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
8391+
auto F = LoadMMO->getFlags() &
8392+
~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
8393+
LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
8394+
Size, LoadMMO->getBaseAlign());
8395+
MachineMemOperand *StoreMMO =
8396+
MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
8397+
sizeof(int32_t), Align(4));
8398+
8399+
auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
8400+
DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
8401+
8402+
return SDValue(Load, 0);
8403+
}
83218404
case Intrinsic::amdgcn_end_cf:
83228405
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
83238406
Op->getOperand(2), Chain), 0);

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
435435
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
436436
if (DataOpIdx == -1)
437437
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
438+
if (DataOpIdx == -1) // LDS DMA
439+
return false;
438440
Width = getOpSize(LdSt, DataOpIdx);
439441
return true;
440442
}

0 commit comments

Comments
 (0)