-
Notifications
You must be signed in to change notification settings - Fork 14k
[LV, VP]VP intrinsics support for the Loop Vectorizer + adding new tail-folding mode using EVL. #76172
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[LV, VP]VP intrinsics support for the Loop Vectorizer + adding new tail-folding mode using EVL. #76172
Changes from all commits
b3c9f6b
e677862
26b944c
ce9cc43
0611c30
6914229
75af7ad
7568798
e320aa2
96b3db5
d000d5a
d476fe1
7c7cc1c
1532c85
08809ca
4d1622d
635a893
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -124,6 +124,7 @@ | |
#include "llvm/IR/User.h" | ||
#include "llvm/IR/Value.h" | ||
#include "llvm/IR/ValueHandle.h" | ||
#include "llvm/IR/VectorBuilder.h" | ||
#include "llvm/IR/Verifier.h" | ||
#include "llvm/Support/Casting.h" | ||
#include "llvm/Support/CommandLine.h" | ||
|
@@ -248,10 +249,12 @@ static cl::opt<TailFoldingStyle> ForceTailFoldingStyle( | |
clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control", | ||
"Create lane mask using active.lane.mask intrinsic, and use " | ||
"it for both data and control flow"), | ||
clEnumValN( | ||
TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, | ||
"data-and-control-without-rt-check", | ||
"Similar to data-and-control, but remove the runtime check"))); | ||
clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck, | ||
"data-and-control-without-rt-check", | ||
"Similar to data-and-control, but remove the runtime check"), | ||
clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl", | ||
"Use predicated EVL instructions for tail folding. If EVL " | ||
"is unsupported, fallback to data-without-lane-mask."))); | ||
|
||
static cl::opt<bool> MaximizeBandwidth( | ||
"vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, | ||
|
@@ -1505,29 +1508,62 @@ class LoopVectorizationCostModel { | |
|
||
/// Returns the TailFoldingStyle that is best for the current loop. | ||
TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { | ||
return IVUpdateMayOverflow ? ChosenTailFoldingStyle.first | ||
: ChosenTailFoldingStyle.second; | ||
if (!ChosenTailFoldingStyle) | ||
return TailFoldingStyle::None; | ||
return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first | ||
: ChosenTailFoldingStyle->second; | ||
} | ||
|
||
/// Selects and saves TailFoldingStyle for 2 options - if IV update may | ||
/// overflow or not. | ||
void setTailFoldingStyles() { | ||
assert(ChosenTailFoldingStyle.first == TailFoldingStyle::None && | ||
ChosenTailFoldingStyle.second == TailFoldingStyle::None && | ||
"Tail folding must not be selected yet."); | ||
if (!Legal->prepareToFoldTailByMasking()) | ||
/// \param IsScalableVF true if scalable vector factors enabled. | ||
/// \param UserIC User specific interleave count. | ||
void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) { | ||
assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet."); | ||
if (!Legal->prepareToFoldTailByMasking()) { | ||
ChosenTailFoldingStyle = | ||
std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None); | ||
return; | ||
} | ||
|
||
if (ForceTailFoldingStyle.getNumOccurrences()) { | ||
ChosenTailFoldingStyle.first = ChosenTailFoldingStyle.second = | ||
ForceTailFoldingStyle; | ||
if (!ForceTailFoldingStyle.getNumOccurrences()) { | ||
ChosenTailFoldingStyle = std::make_pair( | ||
TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true), | ||
TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)); | ||
return; | ||
} | ||
|
||
ChosenTailFoldingStyle.first = | ||
TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true); | ||
ChosenTailFoldingStyle.second = | ||
TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false); | ||
// Set styles when forced. | ||
ChosenTailFoldingStyle = std::make_pair(ForceTailFoldingStyle.getValue(), | ||
ForceTailFoldingStyle.getValue()); | ||
if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL) | ||
return; | ||
// Override forced styles if needed. | ||
// FIXME: use actual opcode/data type for analysis here. | ||
// FIXME: Investigate opportunity for fixed vector factor. | ||
bool EVLIsLegal = | ||
IsScalableVF && UserIC <= 1 && | ||
TTI.hasActiveVectorLength(0, nullptr, Align()) && | ||
!EnableVPlanNativePath && | ||
// FIXME: implement support for max safe dependency distance. | ||
Legal->isSafeForAnyVectorWidth() && | ||
// FIXME: remove this once reductions are supported. | ||
Legal->getReductionVars().empty(); | ||
if (!EVLIsLegal) { | ||
// If for some reason EVL mode is unsupported, fallback to | ||
// DataWithoutLaneMask to try to vectorize the loop with folded tail | ||
// in a generic way. | ||
ChosenTailFoldingStyle = | ||
std::make_pair(TailFoldingStyle::DataWithoutLaneMask, | ||
TailFoldingStyle::DataWithoutLaneMask); | ||
LLVM_DEBUG( | ||
dbgs() | ||
<< "LV: Preference for VP intrinsics indicated. Will " | ||
"not try to generate VP Intrinsics " | ||
<< (UserIC > 1 | ||
? "since interleave count specified is greater than 1.\n" | ||
: "due to non-interleaving reasons.\n")); | ||
} | ||
} | ||
|
||
/// Returns true if all loop blocks should be masked to fold tail loop. | ||
|
@@ -1544,6 +1580,18 @@ class LoopVectorizationCostModel { | |
return foldTailByMasking() || Legal->blockNeedsPredication(BB); | ||
} | ||
|
||
/// Returns true if VP intrinsics with explicit vector length support should | ||
/// be generated in the tail folded loop. | ||
bool foldTailWithEVL() const { | ||
return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL && | ||
// FIXME: remove this once vp_reverse is supported. | ||
none_of( | ||
WideningDecisions, | ||
[](const std::pair<std::pair<Instruction *, ElementCount>, | ||
std::pair<InstWidening, InstructionCost>> | ||
&Data) { return Data.second.first == CM_Widen_Reverse; }); | ||
} | ||
|
||
/// Returns true if the Phi is part of an inloop reduction. | ||
bool isInLoopReduction(PHINode *Phi) const { | ||
return InLoopReductions.contains(Phi); | ||
|
@@ -1688,8 +1736,8 @@ class LoopVectorizationCostModel { | |
|
||
/// Control finally chosen tail folding style. The first element is used if | ||
/// the IV update may overflow, the second element - if it does not. | ||
std::pair<TailFoldingStyle, TailFoldingStyle> ChosenTailFoldingStyle = | ||
std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None); | ||
std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>> | ||
ChosenTailFoldingStyle; | ||
|
||
/// A map holding scalar costs for different vectorization factors. The | ||
/// presence of a cost for an instruction in the mapping indicates that the | ||
|
@@ -4647,9 +4695,24 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { | |
// found modulo the vectorization factor is not zero, try to fold the tail | ||
// by masking. | ||
// FIXME: look for a smaller MaxVF that does divide TC rather than masking. | ||
setTailFoldingStyles(); | ||
if (foldTailByMasking()) | ||
setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC); | ||
if (foldTailByMasking()) { | ||
if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) { | ||
LLVM_DEBUG( | ||
dbgs() | ||
<< "LV: tail is folded with EVL, forcing unroll factor to be 1. Will " | ||
"try to generate VP Intrinsics with scalable vector " | ||
"factors only.\n"); | ||
// Tail folded loop using VP intrinsics restricts the VF to be scalable | ||
// for now. | ||
// TODO: extend it for fixed vectors, if required. | ||
assert(MaxFactors.ScalableVF.isScalable() && | ||
"Expected scalable vector factor."); | ||
|
||
MaxFactors.FixedVF = ElementCount::getFixed(1); | ||
} | ||
return MaxFactors; | ||
} | ||
|
||
// If there was a tail-folding hint/switch, but we can't fold the tail by | ||
// masking, fallback to a vectorization with a scalar epilogue. | ||
|
@@ -5257,6 +5320,13 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, | |
if (!isScalarEpilogueAllowed()) | ||
return 1; | ||
|
||
// Do not interleave if EVL is preferred and no User IC is specified. | ||
alexey-bataev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if (foldTailWithEVL()) { | ||
LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. " | ||
alexey-bataev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
"Unroll factor forced to be 1.\n"); | ||
return 1; | ||
} | ||
|
||
// We used the distance for the interleave count. | ||
if (!Legal->isSafeForAnyVectorWidth()) | ||
return 1; | ||
|
@@ -8487,6 +8557,9 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, | |
VPlanTransforms::truncateToMinimalBitwidths( | ||
*Plan, CM.getMinimalBitwidths(), PSE.getSE()->getContext()); | ||
VPlanTransforms::optimize(*Plan, *PSE.getSE()); | ||
// TODO: try to put it close to addActiveLaneMask(). | ||
if (CM.foldTailWithEVL()) | ||
VPlanTransforms::addExplicitVectorLength(*Plan); | ||
alexey-bataev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); | ||
VPlans.push_back(std::move(Plan)); | ||
} | ||
|
@@ -9179,7 +9252,7 @@ void VPDerivedIVRecipe::execute(VPTransformState &State) { | |
State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags()); | ||
|
||
Value *Step = State.get(getStepValue(), VPIteration(0, 0)); | ||
Value *CanonicalIV = State.get(getCanonicalIV(), VPIteration(0, 0)); | ||
Value *CanonicalIV = State.get(getOperand(1), VPIteration(0, 0)); | ||
Value *DerivedIV = emitTransformedIndex( | ||
State.Builder, CanonicalIV, getStartValue()->getLiveInIRValue(), Step, | ||
Kind, cast_if_present<BinaryOperator>(FPBinOp)); | ||
|
@@ -9307,6 +9380,52 @@ void VPReplicateRecipe::execute(VPTransformState &State) { | |
State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State); | ||
} | ||
|
||
/// Creates either vp_store or vp_scatter intrinsics calls to represent | ||
/// predicated store/scatter. | ||
static Instruction * | ||
lowerStoreUsingVectorIntrinsics(IRBuilderBase &Builder, Value *Addr, | ||
Value *StoredVal, bool IsScatter, Value *Mask, | ||
Value *EVL, const Align &Alignment) { | ||
CallInst *Call; | ||
if (IsScatter) { | ||
Call = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()), | ||
Intrinsic::vp_scatter, | ||
{StoredVal, Addr, Mask, EVL}); | ||
} else { | ||
VectorBuilder VBuilder(Builder); | ||
VBuilder.setEVL(EVL).setMask(Mask); | ||
Call = cast<CallInst>(VBuilder.createVectorInstruction( | ||
Instruction::Store, Type::getVoidTy(EVL->getContext()), | ||
{StoredVal, Addr})); | ||
} | ||
Call->addParamAttr( | ||
1, Attribute::getWithAlignment(Call->getContext(), Alignment)); | ||
return Call; | ||
} | ||
|
||
/// Creates either vp_load or vp_gather intrinsics calls to represent | ||
/// predicated load/gather. | ||
static Instruction *lowerLoadUsingVectorIntrinsics(IRBuilderBase &Builder, | ||
VectorType *DataTy, | ||
Value *Addr, bool IsGather, | ||
Value *Mask, Value *EVL, | ||
const Align &Alignment) { | ||
CallInst *Call; | ||
if (IsGather) { | ||
Call = | ||
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL}, | ||
nullptr, "wide.masked.gather"); | ||
} else { | ||
VectorBuilder VBuilder(Builder); | ||
VBuilder.setEVL(EVL).setMask(Mask); | ||
Call = cast<CallInst>(VBuilder.createVectorInstruction( | ||
Instruction::Load, DataTy, Addr, "vp.op.load")); | ||
} | ||
Call->addParamAttr( | ||
0, Attribute::getWithAlignment(Call->getContext(), Alignment)); | ||
return Call; | ||
} | ||
|
||
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { | ||
VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; | ||
|
||
|
@@ -9345,7 +9464,25 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { | |
for (unsigned Part = 0; Part < State.UF; ++Part) { | ||
Instruction *NewSI = nullptr; | ||
Value *StoredVal = State.get(StoredValue, Part); | ||
if (CreateGatherScatter) { | ||
// TODO: split this into several classes for better design. | ||
if (State.EVL) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Still accessing EVL (as per https://reviews.llvm.org/D99750?id=558054#inline-1551722). To move forward and make progress, I think this would suggest to leave as is now, with a TODO to model this explicitly in the recipe. I am planning on splitting up the memory recipes soon now that address generation has been moved. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Need to check that EVL is accessible, cannot remove it now. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is there a plan to remove it later? This complication of a recipe's execute() goes against VPlan's refactoring guideline, and complicates the abovementioned plan to split up these overly complex recipes. (See "Simplify VPlan execution" in https://llvm.org/devmtg/2023-10/slides/techtalks/Hahn-VPlan-StatusUpdateAndRoadmap.pdf#page=32) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Above was raised earlier in
Reason explained above: execute() of recipes should be straightforward. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Do you want me to spend another year right now discussing the design of this new class? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As mentioned earlier, I think we are in agreement that the recipe should be split up (also independent of EVL support). It's something I am planning to get around to do sometime soon, but there are a number of other changes I would like to wrap up first. So I don't think we need to block the current patch on the refactoring, but leave a TODO to split up the recipe in general. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks @fhahn for answering the "Is there a plan to remove it later?" question, constructively! The excerpts from D99750 (w/o response there, sigh) were provided as context. |
||
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " | ||
"explicit vector length."); | ||
assert(cast<VPInstruction>(State.EVL)->getOpcode() == | ||
VPInstruction::ExplicitVectorLength && | ||
"EVL must be VPInstruction::ExplicitVectorLength."); | ||
Value *EVL = State.get(State.EVL, VPIteration(0, 0)); | ||
// If EVL is not nullptr, then EVL must be a valid value set during plan | ||
// creation, possibly default value = whole vector register length. EVL | ||
// is created only if TTI prefers predicated vectorization, thus if EVL | ||
// is not nullptr it also implies preference for predicated | ||
// vectorization. | ||
// FIXME: Support reverse store after vp_reverse is added. | ||
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this exercised or currently dead code? Additional occurrences below. It treats all-true masks as unmasked, i.e., an absent mask operand passes a null MaskPart, and should be used as argued below. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For consecutive non-masked loads it returns nullptr, VectorBuilder then handles it and generates all-true mask. |
||
NewSI = lowerStoreUsingVectorIntrinsics( | ||
Builder, State.get(getAddr(), Part, !CreateGatherScatter), | ||
StoredVal, CreateGatherScatter, MaskPart, EVL, Alignment); | ||
} else if (CreateGatherScatter) { | ||
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; | ||
Value *VectorGep = State.get(getAddr(), Part); | ||
NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment, | ||
|
@@ -9375,7 +9512,25 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { | |
State.setDebugLocFrom(getDebugLoc()); | ||
for (unsigned Part = 0; Part < State.UF; ++Part) { | ||
Value *NewLI; | ||
if (CreateGatherScatter) { | ||
// TODO: split this into several classes for better design. | ||
if (State.EVL) { | ||
assert(State.UF == 1 && "Expected only UF == 1 when vectorizing with " | ||
"explicit vector length."); | ||
assert(cast<VPInstruction>(State.EVL)->getOpcode() == | ||
VPInstruction::ExplicitVectorLength && | ||
"EVL must be VPInstruction::ExplicitVectorLength."); | ||
Value *EVL = State.get(State.EVL, VPIteration(0, 0)); | ||
alexey-bataev marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// If EVL is not nullptr, then EVL must be a valid value set during plan | ||
// creation, possibly default value = whole vector register length. EVL | ||
// is created only if TTI prefers predicated vectorization, thus if EVL | ||
// is not nullptr it also implies preference for predicated | ||
// vectorization. | ||
// FIXME: Support reverse loading after vp_reverse is added. | ||
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; | ||
NewLI = lowerLoadUsingVectorIntrinsics( | ||
Builder, DataTy, State.get(getAddr(), Part, !CreateGatherScatter), | ||
CreateGatherScatter, MaskPart, EVL, Alignment); | ||
} else if (CreateGatherScatter) { | ||
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; | ||
Value *VectorGep = State.get(getAddr(), Part); | ||
NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart, | ||
|
Uh oh!
There was an error while loading. Please reload this page.