@@ -7056,19 +7056,16 @@ bool BoUpSLP::areAllUsersVectorized(
7056
7056
7057
7057
static std::pair<InstructionCost, InstructionCost>
7058
7058
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7059
- TargetTransformInfo *TTI, TargetLibraryInfo *TLI) {
7059
+ TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7060
+ ArrayRef<Type *> ArgTys) {
7060
7061
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
7061
7062
7062
7063
// Calculate the cost of the scalar and vector calls.
7063
- SmallVector<Type *, 4> VecTys;
7064
- for (Use &Arg : CI->args())
7065
- VecTys.push_back(
7066
- FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
7067
7064
FastMathFlags FMF;
7068
7065
if (auto *FPCI = dyn_cast<FPMathOperator>(CI))
7069
7066
FMF = FPCI->getFastMathFlags();
7070
7067
SmallVector<const Value *> Arguments(CI->args());
7071
- IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, VecTys , FMF,
7068
+ IntrinsicCostAttributes CostAttrs(ID, VecTy, Arguments, ArgTys , FMF,
7072
7069
dyn_cast<IntrinsicInst>(CI));
7073
7070
auto IntrinsicCost =
7074
7071
TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
@@ -7081,8 +7078,8 @@ getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
7081
7078
if (!CI->isNoBuiltin() && VecFunc) {
7082
7079
// Calculate the cost of the vector library call.
7083
7080
// If the corresponding vector call is cheaper, return its cost.
7084
- LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
7085
- TTI::TCK_RecipThroughput);
7081
+ LibCost =
7082
+ TTI->getCallInstrCost(nullptr, VecTy, ArgTys, TTI::TCK_RecipThroughput);
7086
7083
}
7087
7084
return {IntrinsicCost, LibCost};
7088
7085
}
@@ -8508,6 +8505,30 @@ TTI::CastContextHint BoUpSLP::getCastContextHint(const TreeEntry &TE) const {
8508
8505
return TTI::CastContextHint::None;
8509
8506
}
8510
8507
8508
+ /// Builds the vector of argument types for the call instruction \p CI with the
8509
+ /// given \p ID for the specified vector factor \p VF. When \p ID is an intrinsic, operands for which isVectorIntrinsicWithScalarOpAtArg() holds keep their original type, and a non-zero \p MinBW replaces the element type of the remaining operands with an integer of \p MinBW bits; \p MinBW is ignored when \p ID is Intrinsic::not_intrinsic. Returns one type per call argument, in argument order.
8510
+ static SmallVector<Type *> buildIntrinsicArgTypes(const CallInst *CI,
8511
+ const Intrinsic::ID ID,
8512
+ const unsigned VF,
8513
+ unsigned MinBW) {
8514
+ SmallVector<Type *> ArgTys;
8515
+ for (auto [Idx, Arg] : enumerate(CI->args())) {
8516
+ if (ID != Intrinsic::not_intrinsic) { // Intrinsic-specific handling only.
8517
+ if (isVectorIntrinsicWithScalarOpAtArg(ID, Idx)) {
8518
+ ArgTys.push_back(Arg->getType()); // Scalar operand: type is used as is.
8519
+ continue;
8520
+ }
8521
+ if (MinBW > 0) { // A bitwidth was supplied: use <VF x iMinBW> for this operand.
8522
+ ArgTys.push_back(FixedVectorType::get(
8523
+ IntegerType::get(CI->getContext(), MinBW), VF));
8524
+ continue;
8525
+ }
8526
+ }
8527
+ ArgTys.push_back(FixedVectorType::get(Arg->getType(), VF)); // Default: <VF x original argument type>.
8528
+ }
8529
+ return ArgTys;
8530
+ }
8531
+
8511
8532
InstructionCost
8512
8533
BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
8513
8534
SmallPtrSetImpl<Value *> &CheckedExtracts) {
@@ -9074,7 +9095,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
9074
9095
};
9075
9096
auto GetVectorCost = [=](InstructionCost CommonCost) {
9076
9097
auto *CI = cast<CallInst>(VL0);
9077
- auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
9098
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
9099
+ SmallVector<Type *> ArgTys =
9100
+ buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
9101
+ It != MinBWs.end() ? It->second.first : 0);
9102
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
9078
9103
return std::min(VecCallCosts.first, VecCallCosts.second) + CommonCost;
9079
9104
};
9080
9105
return GetCostDiff(GetScalarCost, GetVectorCost);
@@ -12548,7 +12573,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12548
12573
12549
12574
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
12550
12575
12551
- auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
12576
+ SmallVector<Type *> ArgTys =
12577
+ buildIntrinsicArgTypes(CI, ID, VecTy->getNumElements(),
12578
+ It != MinBWs.end() ? It->second.first : 0);
12579
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI, ArgTys);
12552
12580
bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
12553
12581
VecCallCosts.first <= VecCallCosts.second;
12554
12582
@@ -12557,16 +12585,20 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12557
12585
SmallVector<Type *, 2> TysForDecl;
12558
12586
// Add return type if intrinsic is overloaded on it.
12559
12587
if (UseIntrinsic && isVectorIntrinsicWithOverloadTypeAtArg(ID, -1))
12560
- TysForDecl.push_back(
12561
- FixedVectorType::get(CI->getType(), E->Scalars.size()));
12588
+ TysForDecl.push_back(VecTy);
12562
12589
auto *CEI = cast<CallInst>(VL0);
12563
12590
for (unsigned I : seq<unsigned>(0, CI->arg_size())) {
12564
12591
ValueList OpVL;
12565
12592
// Some intrinsics have scalar arguments. This argument should not be
12566
12593
// vectorized.
12567
12594
if (UseIntrinsic && isVectorIntrinsicWithScalarOpAtArg(ID, I)) {
12568
12595
ScalarArg = CEI->getArgOperand(I);
12569
- OpVecs.push_back(CEI->getArgOperand(I));
12596
+ // If decided to reduce bitwidth of abs intrinsic, its second argument
12597
+ // must be set to false (do not return poison, if value is signed min).
12598
+ if (ID == Intrinsic::abs && It != MinBWs.end() &&
12599
+ It->second.first < DL->getTypeSizeInBits(CEI->getType()))
12600
+ ScalarArg = Builder.getFalse();
12601
+ OpVecs.push_back(ScalarArg);
12570
12602
if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I))
12571
12603
TysForDecl.push_back(ScalarArg->getType());
12572
12604
continue;
@@ -12579,10 +12611,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
12579
12611
}
12580
12612
ScalarArg = CEI->getArgOperand(I);
12581
12613
if (cast<VectorType>(OpVec->getType())->getElementType() !=
12582
- ScalarArg->getType()) {
12614
+ ScalarArg->getType() &&
12615
+ It == MinBWs.end()) {
12583
12616
auto *CastTy = FixedVectorType::get(ScalarArg->getType(),
12584
12617
VecTy->getNumElements());
12585
12618
OpVec = Builder.CreateIntCast(OpVec, CastTy, GetOperandSignedness(I));
12619
+ } else if (It != MinBWs.end()) {
12620
+ OpVec = Builder.CreateIntCast(OpVec, VecTy, GetOperandSignedness(I));
12586
12621
}
12587
12622
LLVM_DEBUG(dbgs() << "SLP: OpVec[" << I << "]: " << *OpVec << "\n");
12588
12623
OpVecs.push_back(OpVec);
@@ -14326,6 +14361,62 @@ bool BoUpSLP::collectValuesToDemote(
14326
14361
return TryProcessInstruction(I, *ITE, BitWidth, Ops);
14327
14362
}
14328
14363
14364
+ case Instruction::Call: {
14365
+ auto *IC = dyn_cast<IntrinsicInst>(I);
14366
+ if (!IC)
14367
+ break;
14368
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(IC, TLI);
14369
+ if (ID != Intrinsic::abs && ID != Intrinsic::smin &&
14370
+ ID != Intrinsic::smax && ID != Intrinsic::umin && ID != Intrinsic::umax)
14371
+ break;
14372
+ SmallVector<Value *> Operands(1, I->getOperand(0));
14373
+ function_ref<bool(unsigned, unsigned)> CallChecker;
14374
+ auto CompChecker = [&](unsigned BitWidth, unsigned OrigBitWidth) {
14375
+ assert(BitWidth <= OrigBitWidth && "Unexpected bitwidths!");
14376
+ if (ID == Intrinsic::umin || ID == Intrinsic::umax) {
14377
+ APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
14378
+ return MaskedValueIsZero(I->getOperand(0), Mask, SimplifyQuery(*DL)) &&
14379
+ MaskedValueIsZero(I->getOperand(1), Mask, SimplifyQuery(*DL));
14380
+ }
14381
+ assert((ID == Intrinsic::smin || ID == Intrinsic::smax) &&
14382
+ "Expected min/max intrinsics only.");
14383
+ unsigned SignBits = OrigBitWidth - BitWidth;
14384
+ return SignBits <= ComputeNumSignBits(I->getOperand(0), *DL, 0, AC,
14385
+ nullptr, DT) &&
14386
+ SignBits <=
14387
+ ComputeNumSignBits(I->getOperand(1), *DL, 0, AC, nullptr, DT);
14388
+ };
14389
+ End = 1;
14390
+ if (ID != Intrinsic::abs) {
14391
+ Operands.push_back(I->getOperand(1));
14392
+ End = 2;
14393
+ CallChecker = CompChecker;
14394
+ }
14395
+ InstructionCost BestCost =
14396
+ std::numeric_limits<InstructionCost::CostType>::max();
14397
+ unsigned BestBitWidth = BitWidth;
14398
+ unsigned VF = ITE->Scalars.size();
14399
+ // Choose the best bitwidth based on cost estimations.
14400
+ auto Checker = [&](unsigned BitWidth, unsigned) {
14401
+ unsigned MinBW = PowerOf2Ceil(BitWidth);
14402
+ SmallVector<Type *> ArgTys = buildIntrinsicArgTypes(IC, ID, VF, MinBW);
14403
+ auto VecCallCosts = getVectorCallCosts(
14404
+ IC,
14405
+ FixedVectorType::get(IntegerType::get(IC->getContext(), MinBW), VF),
14406
+ TTI, TLI, ArgTys);
14407
+ InstructionCost Cost = std::min(VecCallCosts.first, VecCallCosts.second);
14408
+ if (Cost < BestCost) {
14409
+ BestCost = Cost;
14410
+ BestBitWidth = BitWidth;
14411
+ }
14412
+ return false;
14413
+ };
14414
+ [[maybe_unused]] bool NeedToExit;
14415
+ (void)AttemptCheckBitwidth(Checker, NeedToExit);
14416
+ BitWidth = BestBitWidth;
14417
+ return TryProcessInstruction(I, *ITE, BitWidth, Operands, CallChecker);
14418
+ }
14419
+
14329
14420
// Otherwise, conservatively give up.
14330
14421
default:
14331
14422
break;
0 commit comments