[AMDGPU] Simplify and improve codegen for llvm.amdgcn.set.inactive #107889

Merged: 5 commits, Sep 11, 2024
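
For context, here is a minimal sketch of how the intrinsic is typically used (the function below is hypothetical and not taken from this PR's tests; the calling convention and the strict.wwm consumer are illustrative assumptions). With this change, the selected V_SET_INACTIVE_B32 pseudo is expanded after register allocation into a single v_cndmask_b32 that picks the active value where the saved whole-wave mask is set and the inactive value elsewhere.

```llvm
; Hypothetical whole-wave prologue: give inactive lanes a known value (0)
; before a strict-WWM computation reads every lane.
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
declare i32 @llvm.amdgcn.strict.wwm.i32(i32)

define amdgpu_cs i32 @set_inactive_example(i32 %active) {
  ; Active lanes keep %active; lanes inactive at the call site read the
  ; second operand (0 here).
  %v = call i32 @llvm.amdgcn.set.inactive.i32(i32 %active, i32 0)
  %r = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %v)
  ret i32 %r
}
```
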
14 changes: 10 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5439,6 +5439,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,

bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;
bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
IID == Intrinsic::amdgcn_set_inactive_chain_arg;

auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
Register Src2, LLT VT) -> Register {
@@ -5448,6 +5450,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
case Intrinsic::amdgcn_permlane64:
return LaneOp.getReg(0);
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
return LaneOp.addUse(Src1).getReg(0);
case Intrinsic::amdgcn_writelane:
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
@@ -5472,7 +5476,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
Register Src0 = MI.getOperand(2).getReg();
Register Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
IsPermLane16) {
IsSetInactive || IsPermLane16) {
Src1 = MI.getOperand(3).getReg();
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
Src2 = MI.getOperand(4).getReg();
@@ -5490,7 +5494,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
if (Size < 32) {
Src0 = B.buildAnyExt(S32, Src0).getReg(0);

if (IsPermLane16)
if (IsSetInactive || IsPermLane16)
Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);

if (IID == Intrinsic::amdgcn_writelane)
@@ -5526,7 +5530,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
MachineInstrBuilder Src1Parts, Src2Parts;

if (IsPermLane16)
if (IsSetInactive || IsPermLane16)
Src1Parts = B.buildUnmerge(PartialResTy, Src1);

if (IID == Intrinsic::amdgcn_writelane)
@@ -5535,7 +5539,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
for (unsigned i = 0; i < NumParts; ++i) {
Src0 = Src0Parts.getReg(i);

if (IsPermLane16)
if (IsSetInactive || IsPermLane16)
Src1 = Src1Parts.getReg(i);

if (IID == Intrinsic::amdgcn_writelane)
@@ -7496,6 +7500,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
return legalizeLaneOp(Helper, MI, IntrID);
case Intrinsic::amdgcn_s_buffer_prefetch_data:
return legalizeSBufferPrefetch(Helper, MI);
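
Since set.inactive is now legalized through legalizeLaneOp, values wider than 32 bits are unmerged into 32-bit pieces and one lane op is built per piece, the same path permlane16 already takes. A sketch of the 64-bit case under the same assumptions as above (hypothetical function, not from this PR's tests):

```llvm
; Hypothetical i64 input: the legalizer splits both the active and the
; inactive source into two i32 halves, emits two 32-bit set.inactive
; operations, and remerges the result.
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64)
declare i64 @llvm.amdgcn.strict.wwm.i64(i64)

define amdgpu_cs i64 @set_inactive_i64(i64 %active, i64 %inactive) {
  %v = call i64 @llvm.amdgcn.set.inactive.i64(i64 %active, i64 %inactive)
  %r = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %v)
  ret i64 %r
}
```
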
16 changes: 11 additions & 5 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6102,6 +6102,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
unsigned IID = N->getConstantOperandVal(0);
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;
bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
IID == Intrinsic::amdgcn_set_inactive_chain_arg;
SDLoc SL(N);
MVT IntVT = MVT::getIntegerVT(ValSize);

@@ -6119,6 +6121,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Operands.push_back(Src2);
[[fallthrough]];
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
Operands.push_back(Src1);
[[fallthrough]];
case Intrinsic::amdgcn_readfirstlane:
@@ -6145,7 +6149,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
SDValue Src0 = N->getOperand(1);
SDValue Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
IsPermLane16) {
IsSetInactive || IsPermLane16) {
Src1 = N->getOperand(2);
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
Src2 = N->getOperand(3);
@@ -6161,7 +6165,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
SL, MVT::i32);

if (IsPermLane16) {
if (IsSetInactive || IsPermLane16) {
Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
SL, MVT::i32);
}
@@ -6237,7 +6241,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
DAG.getConstant(EltIdx, SL, MVT::i32));

if (IsPermLane16)
if (IsSetInactive || IsPermLane16)
Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
DAG.getConstant(EltIdx, SL, MVT::i32));

@@ -6246,7 +6250,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
DAG.getConstant(EltIdx, SL, MVT::i32));

Pieces.push_back(
IsPermLane16
IsSetInactive || IsPermLane16
? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
: createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
EltIdx += 2;
@@ -6262,7 +6266,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
Src0 = DAG.getBitcast(VecVT, Src0);

if (IsPermLane16)
if (IsSetInactive || IsPermLane16)
Src1 = DAG.getBitcast(VecVT, Src1);

if (IID == Intrinsic::amdgcn_writelane)
@@ -8745,6 +8749,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
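
The SelectionDAG path mirrors the GlobalISel change: sub-dword sources are any-extended to 32 bits before the lane op is created and truncated afterwards, and wider values are split per 32-bit element. A sketch of a sub-dword case, assuming a 16-bit overload of the intrinsic (hypothetical function, not from this PR's tests):

```llvm
; Hypothetical i16 input: both operands are any-extended to i32, the
; 32-bit lane op is built, and the result is truncated back to i16.
declare i16 @llvm.amdgcn.set.inactive.i16(i16, i16)
declare i16 @llvm.amdgcn.strict.wwm.i16(i16)

define amdgpu_cs i16 @set_inactive_i16(i16 %active, i16 %inactive) {
  %v = call i16 @llvm.amdgcn.set.inactive.i16(i16 %active, i16 %inactive)
  %r = call i16 @llvm.amdgcn.strict.wwm.i16(i16 %v)
  ret i16 %r
}
```
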
163 changes: 8 additions & 155 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2097,21 +2097,6 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
}
}

Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) {
assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64);
for (auto &Op : MI.implicit_operands()) {
if (Op.isDef())
continue;
Register OpReg = Op.getReg();
if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
OpReg == AMDGPU::SCC)
continue;
return OpReg;
}
return Register();
}

bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
@@ -2286,147 +2271,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
case AMDGPU::V_SET_INACTIVE_B32:
case AMDGPU::V_SET_INACTIVE_B64: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
? AMDGPU::V_MOV_B64_PSEUDO
: AMDGPU::V_MOV_B32_e32;
Register ExecReg = RI.getExec();
case AMDGPU::V_SET_INACTIVE_B32: {
// Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
Register DstReg = MI.getOperand(0).getReg();
MachineOperand &ActiveSrc = MI.getOperand(1);
MachineOperand &InactiveSrc = MI.getOperand(2);

// Find implicit register defining lanes active outside WWM.
Register ExecSrcReg = findSetInactiveMask(MI);
assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region");
// Note: default here is set to ExecReg so that functional MIR is still
// generated if implicit def is not found and assertions are disabled.
if (!ExecSrcReg)
ExecSrcReg = ExecReg;

// Ideally in WWM this operation is lowered to V_CNDMASK; however,
// constant bus constraints and the presence of literal constants
// present an issue.
// Fallback to V_MOV base lowering in all but the common cases.
const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
const MCInstrDesc &Desc = get(Opcode);

const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());

int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);

int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
int ConstantBusUses =
1 + // Starts at 1 for ExecSrcReg
(usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
(usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
int LiteralConstants =
((ActiveSrc.isReg() ||
(ActiveSrc.isImm() && isInlineConstant(ActiveImm)))
? 0
: 1) +
((InactiveSrc.isReg() ||
(InactiveSrc.isImm() && isInlineConstant(InactiveImm)))
? 0
: 1);

bool UseVCndMask =
ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
if (VMov64 && UseVCndMask) {
// Decomposition must not introduce new literals.
UseVCndMask &=
ActiveSrc.isReg() ||
(isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
(!isInlineConstant(ActiveImm));
UseVCndMask &= InactiveSrc.isReg() ||
(isInlineConstant(InactiveImmLo) &&
isInlineConstant(InactiveImmHi)) ||
(!isInlineConstant(InactiveImm));
}

if (UseVCndMask && VMov64) {
// Dual V_CNDMASK_B32
MachineOperand ActiveLo = buildExtractSubRegOrImm(
MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr);
MachineOperand ActiveHi = buildExtractSubRegOrImm(
MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr);
MachineOperand InactiveLo = buildExtractSubRegOrImm(
MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr);
MachineOperand InactiveHi = buildExtractSubRegOrImm(
MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr);
if (ActiveSrc.isReg())
ActiveHi.setIsKill(ActiveSrc.isKill());
if (InactiveSrc.isReg())
InactiveHi.setIsKill(InactiveSrc.isKill());
BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0))
.addImm(0)
.add(InactiveLo)
.addImm(0)
.add(ActiveLo)
.addReg(ExecSrcReg)
.addReg(DstReg, RegState::ImplicitDefine);
BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1))
.addImm(0)
.add(InactiveHi)
.addImm(0)
.add(ActiveHi)
.addReg(ExecSrcReg)
.addReg(DstReg, RegState::ImplicitDefine);
} else if (UseVCndMask) {
// Single V_CNDMASK_B32
BuildMI(MBB, MI, DL, Desc, DstReg)
.addImm(0)
.add(InactiveSrc)
.addImm(0)
.add(ActiveSrc)
.addReg(ExecSrcReg);
} else {
// Fallback V_MOV case.
// Avoid unnecessary work if a source VGPR is also the destination.
// This can happen if WWM register allocation was efficient.
// Note: this assumes WWM execution.
bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
bool DstIsInactive =
InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
if (!DstIsInactive) {
// Set exec mask to inactive lanes,
// but only if active lanes would be overwritten.
if (DstIsActive) {
BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
.addReg(ExecSrcReg)
.setOperandDead(3); // Dead scc
}
// Copy inactive lanes
MachineInstr *VMov =
BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
if (VMov64)
expandPostRAPseudo(*VMov);
}
if (!DstIsActive) {
// Set exec mask to active lanes
BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
// Copy active lanes
MachineInstr *VMov =
BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
.add(ActiveSrc);
if (VMov64)
expandPostRAPseudo(*VMov);
}
// Restore WWM
BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
}
BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
Contributor Author commented:

This would be even simpler if we had MachineInstr::swapOperands(), and simpler still if we swapped the order of the operands in the definition of V_SET_INACTIVE_B32 to match V_CNDMASK_B32 - but I'll leave that for a future cleanup.

.add(MI.getOperand(3))
.add(MI.getOperand(4))
.add(MI.getOperand(1))
.add(MI.getOperand(2))
.add(MI.getOperand(5));
MI.eraseFromParent();
break;
}
2 changes: 0 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1437,8 +1437,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
// This is used if an operand is a 32 bit register but needs to be aligned
// regardless.
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;

static Register findSetInactiveMask(const MachineInstr &MI);
};

/// \brief Returns true if a reg:subreg pair P has a TRC class
21 changes: 4 additions & 17 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -243,29 +243,16 @@ def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))),

// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
// restoring it after we're done.
let Defs = [SCC], isConvergent = 1 in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
(ins VSrc_b32: $src, VSrc_b32:$inactive), []>;

def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
} // End Defs = [SCC]
let isConvergent = 1 in
def V_SET_INACTIVE_B32 : VOP3_Pseudo<"v_set_inactive_b32", VOP2e_I32_I32_I32_I1>;

foreach vt = Reg32Types.types in {
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
(V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
}

foreach vt = Reg64Types.types in {
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
(V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
(V_SET_INACTIVE_B32 0, VSrc_b32:$src, 0, VSrc_b32:$inactive, (IMPLICIT_DEF))>;
}

def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;

def : GCNPat<(i64 (int_amdgcn_set_inactive_chain_arg i64:$src, i64:$inactive)),
(V_SET_INACTIVE_B64 VReg_64:$src, VReg_64:$inactive)>;
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;

let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
3 changes: 1 addition & 2 deletions llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -215,8 +215,7 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock *MBB : RPOT) {
bool InWWM = false;
for (MachineInstr &MI : *MBB) {
if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32)
RegsAssigned |= processDef(MI.getOperand(0));

if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) {