Skip to content

Commit

Permalink
[SLP]Improve minbitwidth analysis for operands of IToFP and ICmp inst…
Browse files Browse the repository at this point in the history
…ructions.

The compiler can improve the minimum bitwidth analysis for the operands of
UIToFP/SIToFP instructions and for the operands of ICmp instructions.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: llvm/llvm-project#85966
  • Loading branch information
alexey-bataev committed Apr 3, 2024
1 parent 3ee93f4 commit 42cbceb
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 16 deletions.
52 changes: 42 additions & 10 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1107,7 +1107,7 @@ class BoUpSLP {
MinBWs.clear();
ReductionBitWidth = 0;
CastMaxMinBWSizes.reset();
TruncNodes.clear();
ExtraBitWidthNodes.clear();
InstrElementSize.clear();
UserIgnoreList = nullptr;
PostponedGathers.clear();
Expand Down Expand Up @@ -3683,8 +3683,9 @@ class BoUpSLP {
/// type sizes, used in the tree.
std::optional<std::pair<unsigned, unsigned>> CastMaxMinBWSizes;

/// Indices of the vectorized trunc nodes.
DenseSet<unsigned> TruncNodes;
/// Indices of the vectorized nodes that are supposed to be the roots of a new
/// bitwidth analysis attempt, like trunc, IToFP or ICmp.
DenseSet<unsigned> ExtraBitWidthNodes;
};

} // end namespace slpvectorizer
Expand Down Expand Up @@ -6612,7 +6613,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
PrevMaxBW),
std::min<unsigned>(DL->getTypeSizeInBits(VL0->getType()),
PrevMinBW));
TruncNodes.insert(VectorizableTree.size());
ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
} else if (ShuffleOrOp == Instruction::SIToFP ||
ShuffleOrOp == Instruction::UIToFP) {
unsigned NumSignBits =
ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
if (auto *OpI = dyn_cast<Instruction>(VL0->getOperand(0))) {
APInt Mask = DB->getDemandedBits(OpI);
NumSignBits = std::max(NumSignBits, Mask.countl_zero());
}
if (NumSignBits * 2 >=
DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
ExtraBitWidthNodes.insert(VectorizableTree.size() + 1);
}
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
Expand Down Expand Up @@ -6660,6 +6672,18 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
TE->setOperand(1, Right);
buildTree_rec(Left, Depth + 1, {TE, 0});
buildTree_rec(Right, Depth + 1, {TE, 1});
if (ShuffleOrOp == Instruction::ICmp) {
unsigned NumSignBits0 =
ComputeNumSignBits(VL0->getOperand(0), *DL, 0, AC, nullptr, DT);
if (NumSignBits0 * 2 >=
DL->getTypeSizeInBits(VL0->getOperand(0)->getType()))
ExtraBitWidthNodes.insert(getOperandEntry(TE, 0)->Idx);
unsigned NumSignBits1 =
ComputeNumSignBits(VL0->getOperand(1), *DL, 0, AC, nullptr, DT);
if (NumSignBits1 * 2 >=
DL->getTypeSizeInBits(VL0->getOperand(1)->getType()))
ExtraBitWidthNodes.insert(getOperandEntry(TE, 1)->Idx);
}
return;
}
case Instruction::Select:
Expand Down Expand Up @@ -14302,7 +14326,8 @@ void BoUpSLP::computeMinimumValueSizes() {
bool IsStoreOrInsertElt =
VectorizableTree.front()->getOpcode() == Instruction::Store ||
VectorizableTree.front()->getOpcode() == Instruction::InsertElement;
if ((IsStoreOrInsertElt || UserIgnoreList) && TruncNodes.size() <= 1 &&
if ((IsStoreOrInsertElt || UserIgnoreList) &&
ExtraBitWidthNodes.size() <= 1 &&
(!CastMaxMinBWSizes || CastMaxMinBWSizes->second == 0 ||
CastMaxMinBWSizes->first / CastMaxMinBWSizes->second <= 2))
return;
Expand Down Expand Up @@ -14506,16 +14531,23 @@ void BoUpSLP::computeMinimumValueSizes() {
IsTopRoot = false;
IsProfitableToDemoteRoot = true;

if (TruncNodes.empty()) {
if (ExtraBitWidthNodes.empty()) {
NodeIdx = VectorizableTree.size();
} else {
unsigned NewIdx = 0;
do {
NewIdx = *TruncNodes.begin() + 1;
TruncNodes.erase(TruncNodes.begin());
} while (NewIdx <= NodeIdx && !TruncNodes.empty());
NewIdx = *ExtraBitWidthNodes.begin();
ExtraBitWidthNodes.erase(ExtraBitWidthNodes.begin());
} while (NewIdx <= NodeIdx && !ExtraBitWidthNodes.empty());
NodeIdx = NewIdx;
IsTruncRoot = true;
IsTruncRoot =
NodeIdx < VectorizableTree.size() &&
any_of(VectorizableTree[NodeIdx]->UserTreeIndices,
[](const EdgeInfo &EI) {
return EI.EdgeIdx == 0 &&
EI.UserTE->getOpcode() == Instruction::Trunc &&
!EI.UserTE->isAltShuffle();
});
}

// If the maximum bit width we compute is less than the width of the roots'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ define i1 @test(ptr noalias %0, i64 %1, ptr noalias %p, ptr %p1) {
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <2 x i24> [[TMP8]], <i24 24, i24 24>
; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP9]], <2 x i24> <i24 23, i24 23>, <2 x i24> [[TMP8]]
; CHECK-NEXT: [[TMP23:%.*]] = trunc <2 x i24> [[TMP10]] to <2 x i8>
; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP11]], <i32 254, i32 254>
; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP26]], <i32 254, i32 254>
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <2 x i32> [[TMP12]], <i32 4, i32 4>
; CHECK-NEXT: [[TMP25:%.*]] = select <2 x i1> [[TMP13]], <2 x i8> <i8 2, i8 2>, <2 x i8> [[TMP23]]
; CHECK-NEXT: [[TMP14:%.*]] = zext <2 x i8> [[TMP25]] to <2 x i32>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@ define void @test() {
; CHECK-NEXT: [[TMP3:%.*]] = select i1 false, i32 0, i32 0
; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i8> <i8 poison, i8 0, i8 poison, i8 poison>, i8 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> <i32 0, i32 0, i32 0, i32 1>
; CHECK-NEXT: [[TMP6:%.*]] = sext <4 x i8> [[TMP5]] to <4 x i32>
; CHECK-NEXT: [[TMP15:%.*]] = trunc <4 x i8> [[TMP5]] to <4 x i1>
; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i8> [[TMP7]], zeroinitializer
; CHECK-NEXT: [[TMP9:%.*]] = sext <4 x i8> [[TMP8]] to <4 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i32> zeroinitializer, [[TMP6]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[TMP9]], [[TMP10]]
; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i8> [[TMP8]] to <4 x i1>
; CHECK-NEXT: [[TMP10:%.*]] = or <4 x i1> zeroinitializer, [[TMP15]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i1> [[TMP9]], [[TMP10]]
; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i1> [[TMP16]] to <4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> <i32 0, i32 0, i32 poison, i32 0>, <4 x i32> <i32 4, i32 5, i32 2, i32 7>
; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP13]])
Expand Down

0 comments on commit 42cbceb

Please sign in to comment.