Skip to content

Commit 1aa8a6f

Browse files
authored
[VPlan] Compute cost for most opcodes in VPWidenRecipe (NFCI). (#98764)
Implement VPWidenRecipe::computeCost for most cases (except UDiv, SDiv, URem and SRem, which require additional logic). Note that this specializes `::computeCost` instead of `::cost`, as `VPRecipeBase::cost` is responsible for skipping cost computations for pre-computed recipes for now. The most recent version of the VPlan-based cost model introduction was committed on Jul 10 (b841e2e), and we should probably give it at least a week in case additional mismatches surface. PR: #98764
1 parent cf721e2 commit 1aa8a6f

File tree

3 files changed

+93
-10
lines changed

3 files changed

+93
-10
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4354,8 +4354,8 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
43544354
SmallVector<RecipeVFPair> InvalidCosts;
43554355
for (const auto &Plan : VPlans) {
43564356
for (ElementCount VF : Plan->vectorFactors()) {
4357-
VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx,
4358-
CM);
4357+
VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4358+
LLVMCtx, CM);
43594359
auto Iter = vp_depth_first_deep(Plan->getVectorLoopRegion()->getEntry());
43604360
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
43614361
for (auto &R : *VPBB) {
@@ -7062,7 +7062,8 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
70627062
ElementCount VF) const {
70637063
InstructionCost Cost = 0;
70647064
LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
7065-
VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);
7065+
VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
7066+
LLVMCtx, CM);
70667067

70677068
// Cost modeling for inductions is inaccurate in the legacy cost model
70687069
// compared to the recipes that are generated. To match here initially during

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -736,14 +736,16 @@ class VPLiveOut : public VPUser {
736736
/// Struct to hold various analyses needed for cost computations.
737737
struct VPCostContext {
738738
const TargetTransformInfo &TTI;
739+
const TargetLibraryInfo &TLI;
739740
VPTypeAnalysis Types;
740741
LLVMContext &LLVMCtx;
741742
LoopVectorizationCostModel &CM;
742743
SmallPtrSet<Instruction *, 8> SkipCostComputation;
743744

744-
VPCostContext(const TargetTransformInfo &TTI, Type *CanIVTy,
745-
LLVMContext &LLVMCtx, LoopVectorizationCostModel &CM)
746-
: TTI(TTI), Types(CanIVTy, LLVMCtx), LLVMCtx(LLVMCtx), CM(CM) {}
745+
VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI,
746+
Type *CanIVTy, LLVMContext &LLVMCtx,
747+
LoopVectorizationCostModel &CM)
748+
: TTI(TTI), TLI(TLI), Types(CanIVTy, LLVMCtx), LLVMCtx(LLVMCtx), CM(CM) {}
747749

748750
/// Return the cost for \p UI with \p VF using the legacy cost model as
749751
/// fallback until computing the cost of all recipes migrates to VPlan.
@@ -796,7 +798,7 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
796798
/// Return the cost of this recipe, taking into account if the cost
797799
/// computation should be skipped and the ForceTargetInstructionCost flag.
798800
/// Also takes care of printing the cost for debugging.
799-
virtual InstructionCost cost(ElementCount VF, VPCostContext &Ctx);
801+
InstructionCost cost(ElementCount VF, VPCostContext &Ctx);
800802

801803
/// Insert an unlinked recipe into a basic block immediately before
802804
/// the specified recipe.
@@ -860,9 +862,11 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
860862
DebugLoc getDebugLoc() const { return DL; }
861863

862864
protected:
863-
/// Compute the cost of this recipe using the legacy cost model and the
864-
/// underlying instructions.
865-
InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const;
865+
/// Compute the cost of this recipe either using a recipe's specialized
866+
/// implementation or using the legacy cost model and the underlying
867+
/// instructions.
868+
virtual InstructionCost computeCost(ElementCount VF,
869+
VPCostContext &Ctx) const;
866870
};
867871

868872
// Helper macro to define common classof implementations for recipes.
@@ -1426,6 +1430,10 @@ class VPWidenRecipe : public VPRecipeWithIRFlags {
14261430
/// processing State.VF elements.
14271431
void execute(VPTransformState &State) override;
14281432

1433+
/// Return the cost of this VPWidenRecipe.
1434+
InstructionCost computeCost(ElementCount VF,
1435+
VPCostContext &Ctx) const override;
1436+
14291437
unsigned getOpcode() const { return Opcode; }
14301438

14311439
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1140,6 +1140,80 @@ void VPWidenRecipe::execute(VPTransformState &State) {
11401140
#endif
11411141
}
11421142

1143+
// Compute the cost of this widened recipe for vectorization factor \p VF by
// dispatching on Opcode. UDiv/SDiv/SRem/URem defer to the legacy cost model
// through Ctx.getLegacyCost; every other handled opcode queries Ctx.TTI on a
// vector type built from an inferred scalar type and \p VF. An opcode that is
// not listed in the switch reaches llvm_unreachable.
InstructionCost VPWidenRecipe::computeCost(ElementCount VF,
1144+
VPCostContext &Ctx) const {
1145+
// Every TTI query below uses the reciprocal-throughput cost kind.
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1146+
switch (Opcode) {
1147+
case Instruction::FNeg: {
1148+
// The vector type is derived from the scalar type of this recipe's result.
Type *VectorTy =
1149+
ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
1150+
// Both operand-info arguments are passed as OK_AnyValue / OP_None; no
// operands, context instruction or TLI are supplied for FNeg.
return Ctx.TTI.getArithmeticInstrCost(
1151+
Opcode, VectorTy, CostKind,
1152+
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1153+
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None});
1154+
}
1155+
1156+
case Instruction::UDiv:
1157+
case Instruction::SDiv:
1158+
case Instruction::SRem:
1159+
case Instruction::URem:
1160+
// More complex computation, let the legacy cost-model handle this for now.
// NOTE(review): cast<> requires getUnderlyingValue() to be a non-null
// Instruction here -- presumably guaranteed for these opcodes; confirm.
return Ctx.getLegacyCost(cast<Instruction>(getUnderlyingValue()), VF);
1161+
case Instruction::Add:
1162+
case Instruction::FAdd:
1163+
case Instruction::Sub:
1164+
case Instruction::FSub:
1165+
case Instruction::Mul:
1166+
case Instruction::FMul:
1167+
case Instruction::FDiv:
1168+
case Instruction::FRem:
1169+
case Instruction::Shl:
1170+
case Instruction::LShr:
1171+
case Instruction::AShr:
1172+
case Instruction::And:
1173+
case Instruction::Or:
1174+
case Instruction::Xor: {
1175+
VPValue *RHS = getOperand(1);
1176+
// Certain instructions can be cheaper to vectorize if they have a constant
1177+
// second vector operand. One example of this is shifts on x86.
1178+
// Start from the generic OK_AnyValue / OP_None classification and refine it
// below where more is known about the second operand.
TargetTransformInfo::OperandValueInfo RHSInfo = {
1179+
TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None};
1180+
// A live-in RHS exposes its IR value, so TTI can compute its operand info.
if (RHS->isLiveIn())
1181+
RHSInfo = Ctx.TTI.getOperandInfo(RHS->getLiveInIRValue());
1182+
1183+
// If the RHS is still classified as OK_AnyValue but is defined outside the
// vector regions, reclassify it as OK_UniformValue.
if (RHSInfo.Kind == TargetTransformInfo::OK_AnyValue &&
1184+
getOperand(1)->isDefinedOutsideVectorRegions())
1185+
RHSInfo.Kind = TargetTransformInfo::OK_UniformValue;
1186+
Type *VectorTy =
1187+
ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
1188+
// CtxI is null when there is no underlying value or it is not an
// Instruction; in that case Operands stays empty.
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1189+
1190+
SmallVector<const Value *, 4> Operands;
1191+
if (CtxI)
1192+
Operands.append(CtxI->value_op_begin(), CtxI->value_op_end());
1193+
// The operand info preceding RHSInfo (presumably the first/LHS operand) is
// always passed as OK_AnyValue / OP_None.
return Ctx.TTI.getArithmeticInstrCost(
1194+
Opcode, VectorTy, CostKind,
1195+
{TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None},
1196+
RHSInfo, Operands, CtxI, &Ctx.TLI);
1197+
}
1198+
case Instruction::Freeze: {
1199+
// This opcode is unknown. Assume that it is the same as 'mul'.
1200+
Type *VectorTy =
1201+
ToVectorTy(Ctx.Types.inferScalarType(this->getVPSingleValue()), VF);
1202+
// Only the opcode, vector type and cost kind are passed for this query.
return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
1203+
}
1204+
case Instruction::ICmp:
1205+
case Instruction::FCmp: {
1206+
// CtxI may be null (dyn_cast_or_null) and is forwarded to TTI as-is.
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
1207+
// Unlike the arithmetic cases, the vector type is built from the scalar type
// of operand 0 rather than from this recipe's result.
Type *VectorTy = ToVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
1208+
return Ctx.TTI.getCmpSelInstrCost(Opcode, VectorTy, nullptr, getPredicate(),
1209+
CostKind, CtxI);
1210+
}
1211+
default:
1212+
llvm_unreachable("Unsupported opcode for instruction");
1213+
}
1214+
}
1216+
11431217
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
11441218
void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
11451219
VPSlotTracker &SlotTracker) const {

0 commit comments

Comments
 (0)