[AMDGPU] Eliminate likely-spurious execz checks via intrinsic argument #123749

Closed
4 changes: 2 additions & 2 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3409,11 +3409,11 @@ def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
 // having side effects, which is sufficient to prevent optimizations without
 // having to mark them as convergent.
 def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
-  [llvm_i1_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
+  [llvm_i1_ty, llvm_i1_ty], [ImmArg<ArgIndex<1>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;

 def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
-  [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
+  [llvm_anyint_ty, llvm_i1_ty], [ImmArg<ArgIndex<1>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;

 def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty],
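
The new trailing i1 is an ImmArg, so callers must pass a compile-time constant rather than a computed value. As a minimal sketch (not part of the diff) of emitting the updated intrinsic with IRBuilder — `Intrinsic::getOrInsertDeclaration` is the current name of the declaration helper (`Intrinsic::getDeclaration` on older trees), and the helper name `emitAmdgcnIf` and the in-scope types are assumptions:

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Sketch: emit llvm.amdgcn.if with the new likely-divergent flag.
// IntMaskTy is the wave mask type (i32 for wave32, i64 for wave64).
static Value *emitAmdgcnIf(IRBuilder<> &IRB, Module &M, Value *Cond,
                           Type *IntMaskTy, bool LikelyDivergent) {
  Function *IfDecl =
      Intrinsic::getOrInsertDeclaration(&M, Intrinsic::amdgcn_if, {IntMaskTy});
  // ImmArg<ArgIndex<1>> requires a constant i1 here.
  Value *Flag = LikelyDivergent ? IRB.getTrue() : IRB.getFalse();
  CallInst *IfCall = IRB.CreateCall(IfDecl, {Cond, Flag});
  return IRB.CreateExtractValue(IfCall, {0}); // replacement branch condition
}
```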
8 changes: 4 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -42,12 +42,12 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4,
 def ImmOp : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;

-def AMDGPUIfOp : SDTypeProfile<1, 2,
-  [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
+def AMDGPUIfOp : SDTypeProfile<1, 3,
+  [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, i1>, SDTCisVT<3, OtherVT>]
 >;

-def AMDGPUElseOp : SDTypeProfile<1, 2,
-  [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
+def AMDGPUElseOp : SDTypeProfile<1, 3,
+  [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, i1>, SDTCisVT<3, OtherVT>]
 >;

 def AMDGPULoopOp : SDTypeProfile<0, 2,
9 changes: 6 additions & 3 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -7264,6 +7264,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,

       Register Def = MI.getOperand(1).getReg();
       Register Use = MI.getOperand(3).getReg();
+      auto LikelyDivergent = MI.getOperand(4).getImm();

       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();

@@ -7273,13 +7274,15 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
       B.setInsertPt(B.getMBB(), BrCond->getIterator());
       if (IntrID == Intrinsic::amdgcn_if) {
         B.buildInstr(AMDGPU::SI_IF)
-          .addDef(Def)
-          .addUse(Use)
-          .addMBB(UncondBrTarget);
+            .addDef(Def)
+            .addUse(Use)
+            .addImm(LikelyDivergent)
+            .addMBB(UncondBrTarget);
       } else {
         B.buildInstr(AMDGPU::SI_ELSE)
             .addDef(Def)
             .addUse(Use)
+            .addImm(LikelyDivergent)
             .addMBB(UncondBrTarget);
       }

108 changes: 104 additions & 4 deletions llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -22,6 +22,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -36,6 +37,25 @@ namespace {
 using StackEntry = std::pair<BasicBlock *, Value *>;
 using StackVector = SmallVector<StackEntry, 16>;

+class DynamicDivergenceHeuristic {
+public:
+  DynamicDivergenceHeuristic(const Function &F, const GCNSubtarget &ST) {
+    IsSingleLaneExecution = ST.isSingleLaneExecution(F);
+  }
+
+  /// Check if \p V is likely to have dynamically diverging values among the
+  /// workitems in each wavefront.
+  bool isLikelyDivergent(const Value *V);
+
+private:
+  bool IsSingleLaneExecution = false;
+
+  bool isWorkitemID(const Value *V) const;
+
+  DenseSet<const Value *> Visited;
+  ValueMap<const Value *, bool> LikelyDivergentCache;
+};
+
 class SIAnnotateControlFlow {
 private:
   Function *F;
@@ -62,6 +82,8 @@ class SIAnnotateControlFlow {

   LoopInfo *LI;

+  DynamicDivergenceHeuristic DivergenceHeuristic;
+
   void initialize(const GCNSubtarget &ST);

   bool isUniform(BranchInst *T);
@@ -99,7 +121,7 @@ class SIAnnotateControlFlow {
 public:
   SIAnnotateControlFlow(Function &F, const GCNSubtarget &ST, DominatorTree &DT,
                         LoopInfo &LI, UniformityInfo &UA)
-      : F(&F), UA(&UA), DT(&DT), LI(&LI) {
+      : F(&F), UA(&UA), DT(&DT), LI(&LI), DivergenceHeuristic(F, ST) {
     initialize(ST);
   }
@@ -186,9 +208,15 @@ bool SIAnnotateControlFlow::openIf(BranchInst *Term) {
   if (isUniform(Term))
     return false;

+  // Check if it's likely that at least one lane will always follow the
+  // then-branch, i.e., the then-branch is never skipped completely.
+  Value *IsLikelyDivergent =
+      DivergenceHeuristic.isLikelyDivergent(Term->getCondition()) ? BoolTrue
+                                                                  : BoolFalse;
+
   IRBuilder<> IRB(Term);
   Value *IfCall = IRB.CreateCall(getDecl(If, Intrinsic::amdgcn_if, IntMask),
-                                 {Term->getCondition()});
+                                 {Term->getCondition(), IsLikelyDivergent});
   Value *Cond = IRB.CreateExtractValue(IfCall, {0});
   Value *Mask = IRB.CreateExtractValue(IfCall, {1});
   Term->setCondition(Cond);
@@ -202,9 +230,17 @@ bool SIAnnotateControlFlow::insertElse(BranchInst *Term) {
     return false;
   }

+  Value *IncomingMask = popSaved();
+  // Check if it's likely that at least one lane will always follow the
+  // else-branch, i.e., the else-branch is never skipped completely.
+  Value *IsLikelyDivergent = DivergenceHeuristic.isLikelyDivergent(IncomingMask)
+                                 ? BoolTrue
+                                 : BoolFalse;
+
   IRBuilder<> IRB(Term);
-  Value *ElseCall = IRB.CreateCall(
-      getDecl(Else, Intrinsic::amdgcn_else, {IntMask, IntMask}), {popSaved()});
+  Value *ElseCall =
+      IRB.CreateCall(getDecl(Else, Intrinsic::amdgcn_else, {IntMask, IntMask}),
+                     {IncomingMask, IsLikelyDivergent});
   Value *Cond = IRB.CreateExtractValue(ElseCall, {0});
   Value *Mask = IRB.CreateExtractValue(ElseCall, {1});
   Term->setCondition(Cond);
@@ -385,6 +421,70 @@ bool SIAnnotateControlFlow::run() {
   return Changed;
 }

+bool DynamicDivergenceHeuristic::isWorkitemID(const Value *V) const {
+  auto *II = dyn_cast<IntrinsicInst>(V);
+  if (!II)
+    return false;
+
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::amdgcn_workitem_id_z:
+  case Intrinsic::r600_read_tidig_z:
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::r600_read_tidig_y:
+  case Intrinsic::amdgcn_workitem_id_x:
+  case Intrinsic::r600_read_tidig_x:
+  case Intrinsic::amdgcn_mbcnt_hi:
+  case Intrinsic::amdgcn_mbcnt_lo:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool DynamicDivergenceHeuristic::isLikelyDivergent(const Value *V) {
+  if (IsSingleLaneExecution)
+    return false;
+
+  if (isWorkitemID(V))
+    return true;
+
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+
+  // Floating-point computations tend to be too complex to judge if they are
+  // likely divergent.
+  if (I->getType()->isFloatingPointTy())
+    return false;
+
+  // ExtractValueInst and IntrinsicInst enable looking through the
+  // amdgcn_if/else intrinsics inserted by SIAnnotateControlFlow.
+  if (!isa<BinaryOperator>(I) && !isa<UnaryOperator>(I) && !isa<CastInst>(I) &&
+      !isa<CmpInst>(I) && !isa<ExtractValueInst>(I) && !isa<IntrinsicInst>(I) &&
+      !isa<PHINode>(I) && !isa<SelectInst>(I))
+    return false;
+
+  // Have we already checked V?
+  auto CacheEntry = LikelyDivergentCache.find(V);
+  if (CacheEntry != LikelyDivergentCache.end())
+    return CacheEntry->second;
+
+  // Have we hit a cycle?
+  if (!Visited.insert(V).second)
+    return false;
+
+  // Does it use a likely varying Value?
+  bool Result = false;
+  for (const auto &Use : I->operands()) {
+    Result |= isLikelyDivergent(Use);
+    if (Result)
+      break;
+  }
+
+  LikelyDivergentCache.insert({V, Result});
+  return Result;
+}
+
 PreservedAnalyses SIAnnotateControlFlowPass::run(Function &F,
                                                  FunctionAnalysisManager &FAM) {
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
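
The heuristic is a memoized depth-first search over integer dataflow: a value is flagged as likely divergent when some transitive operand — reached through the allowed instruction kinds listed above — originates from a workitem-ID-style intrinsic, with a cache for sharing and a visited set to cut cycles through PHIs. Below is a standalone toy model of just that traversal; `Node` is a hypothetical stand-in for `llvm::Value`, not the PR's types:

```cpp
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct Node {
  bool IsWorkitemID = false;     // e.g. llvm.amdgcn.workitem.id.x
  std::vector<const Node *> Ops; // operands
};

// A node is "likely divergent" if any transitive operand is a
// workitem-ID-style source; memoized, with a cycle guard for PHIs.
bool isLikelyDivergent(const Node *N,
                       std::unordered_map<const Node *, bool> &Cache,
                       std::unordered_set<const Node *> &Visited) {
  if (N->IsWorkitemID)
    return true;
  if (auto It = Cache.find(N); It != Cache.end())
    return It->second;
  if (!Visited.insert(N).second) // hit a cycle (e.g. a loop PHI)
    return false;
  bool Result = false;
  for (const Node *Op : N->Ops)
    if ((Result = isLikelyDivergent(Op, Cache, Visited)))
      break;
  Cache.emplace(N, Result);
  return Result;
}
```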
10 changes: 5 additions & 5 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -417,8 +417,8 @@ def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask),
 let isTerminator = 1, isNotDuplicable = 1 in {

 def SI_IF: CFPseudoInstSI <
-  (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target),
-  [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
+  (outs SReg_1:$dst), (ins SReg_1:$vcc, i1imm:$likelydivergent, brtarget:$target),
+  [(set i1:$dst, (AMDGPUif i1:$vcc, (i1 timm:$likelydivergent), bb:$target))], 1, 1> {
   let Constraints = "";
   let Size = 12;
   let hasSideEffects = 1;
@@ -427,7 +427,7 @@ def SI_IF: CFPseudoInstSI <

 def SI_ELSE : CFPseudoInstSI <
   (outs SReg_1:$dst),
-  (ins SReg_1:$src, brtarget:$target), [], 1, 1> {
+  (ins SReg_1:$src, i1imm:$likelydivergent, brtarget:$target), [], 1, 1> {
   let Size = 12;
   let hasSideEffects = 1;
   let IsNeverUniform = 1;
@@ -1049,8 +1049,8 @@ def : GCNPat<
 >;

 def : GCNPat<
-  (AMDGPUelse i1:$src, bb:$target),
-  (SI_ELSE $src, $target)
+  (AMDGPUelse i1:$src, i1:$likelydivergent, bb:$target),
+  (SI_ELSE $src, $likelydivergent, $target)
 >;

 def : Pat <
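
The inserted `$likelydivergent` immediate shifts the explicit operand indices of SI_IF/SI_ELSE, which is why the MIR passes below move from `getOperand(2)` to `getOperand(3)` for the branch target, and why emitIf's SCC operand moves from index 4 to 5. A hedged sketch of reading the layout implied by the `ins` lists above (the helper name is hypothetical, and the target-internal headers are assumed to be on the include path):

```cpp
#include "AMDGPU.h" // target-internal, provides the AMDGPU::SI_IF opcode
#include "llvm/CodeGen/MachineInstr.h"
#include <cassert>

using namespace llvm;

// Sketch: SI_IF's post-change operand layout in an MIR pass.
//   0: $dst              saved exec mask (def)
//   1: $vcc              i1 condition
//   2: $likelydivergent  new i1 immediate
//   3: $target           execz branch destination
// Implicit operands (e.g. the SCC def read as getOperand(5) in emitIf
// below) follow the explicit ones.
static void inspectSiIf(const MachineInstr &MI) {
  assert(MI.getOpcode() == AMDGPU::SI_IF && "expected SI_IF");
  Register SavedExec = MI.getOperand(0).getReg();
  Register Cond = MI.getOperand(1).getReg();
  bool LikelyDivergent = MI.getOperand(2).getImm() != 0;
  MachineBasicBlock *Target = MI.getOperand(3).getMBB();
  (void)SavedExec;
  (void)Cond;
  (void)LikelyDivergent;
  (void)Target;
}
```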
36 changes: 33 additions & 3 deletions llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -55,8 +55,11 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/BranchProbability.h"
 #include "llvm/Target/TargetMachine.h"

 using namespace llvm;
@@ -213,6 +216,21 @@ static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
   return true;
 }

+/// Mark Succ as an unlikely successor of MBB.
+static void setSuccessorUnlikely(MachineBasicBlock &MBB,
+                                 const MachineBasicBlock *Succ) {
+  auto E = MBB.succ_end();
+  bool Found = false;
+  for (auto SI = MBB.succ_begin(); SI != E; ++SI) {
+    if (*SI == Succ) {
+      MBB.setSuccProbability(SI, BranchProbability::getZero());
+      Found = true;
+    }
+  }
+  assert(Found && "Succ must be a successor of MBB!");
+  MBB.normalizeSuccProbs();
+}
+
 void SILowerControlFlow::emitIf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
@@ -221,9 +239,11 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   MachineOperand& Cond = MI.getOperand(1);
   assert(Cond.getSubReg() == AMDGPU::NoSubRegister);

-  MachineOperand &ImpDefSCC = MI.getOperand(4);
+  MachineOperand &ImpDefSCC = MI.getOperand(5);
   assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

+  bool LikelyDivergent = MI.getOperand(2).getImm();
+
   // If there is only one use of save exec register and that use is SI_END_CF,
   // we can optimize SI_IF by returning the full saved exec mask instead of
   // just cleared bits.
@@ -281,7 +301,12 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   // Insert the S_CBRANCH_EXECZ instruction which will be optimized later
   // during SIPreEmitPeephole.
   MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
-                            .add(MI.getOperand(2));
+                            .add(MI.getOperand(3));
+
+  if (LikelyDivergent) {
+    MachineBasicBlock *ExeczDest = MI.getOperand(3).getMBB();
+    setSuccessorUnlikely(MBB, ExeczDest);
+  }

   if (!LIS) {
     MI.eraseFromParent();
@@ -329,7 +354,9 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
   if (LV)
     LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec);

-  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
+  bool LikelyDivergent = MI.getOperand(2).getImm();
+
+  MachineBasicBlock *DestBB = MI.getOperand(3).getMBB();

   MachineBasicBlock::iterator ElsePt(MI);

@@ -352,6 +379,9 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
   BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
       .addMBB(DestBB);

+  if (LikelyDivergent)
+    setSuccessorUnlikely(MBB, DestBB);
+
   if (!LIS) {
     MI.eraseFromParent();
     return;
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -184,7 +184,7 @@ MachineBasicBlock *
 SIOptimizeVGPRLiveRange::getElseTarget(MachineBasicBlock *MBB) const {
   for (auto &BR : MBB->terminators()) {
     if (BR.getOpcode() == AMDGPU::SI_ELSE)
-      return BR.getOperand(2).getMBB();
+      return BR.getOperand(3).getMBB();
   }
   return nullptr;
 }
@@ -682,7 +682,7 @@ bool SIOptimizeVGPRLiveRange::run(MachineFunction &MF) {
     for (auto &MI : MBB.terminators()) {
       // Detect the if-else blocks
       if (MI.getOpcode() == AMDGPU::SI_IF) {
-        MachineBasicBlock *IfTarget = MI.getOperand(2).getMBB();
+        MachineBasicBlock *IfTarget = MI.getOperand(3).getMBB();
         auto *Endif = getElseTarget(IfTarget);
         if (!Endif)
           continue;
4 changes: 4 additions & 0 deletions llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -14,6 +14,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/Support/BranchProbability.h"
@@ -22,6 +23,8 @@ using namespace llvm;

 #define DEBUG_TYPE "si-pre-emit-peephole"

+STATISTIC(NumCBranchExeczElim, "Number of s_cbranch_execz eliminated.");
+
 namespace {

 class SIPreEmitPeephole : public MachineFunctionPass {
@@ -404,6 +407,7 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
     return false;

   LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
+  ++NumCBranchExeczElim;
   MI.eraseFromParent();
   SrcMBB.removeSuccessor(TrueMBB);

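
The new counter follows the standard STATISTIC pattern: it is keyed by the DEBUG_TYPE defined above and is reported by `llc`'s `-stats` output. A small sketch of enabling and dumping statistics programmatically around a codegen run — the wrapper function is hypothetical and the pipeline invocation is elided:

```cpp
#include "llvm/ADT/Statistic.h"
#include "llvm/Support/raw_ostream.h"

// Sketch: turn on statistics collection (what the -stats flag does) and
// print all registered counters, including NumCBranchExeczElim.
void reportExeczStats() {
  llvm::EnableStatistics();
  // ... run the AMDGPU codegen pipeline here ...
  llvm::PrintStatistics(llvm::errs());
}
```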