-
Notifications
You must be signed in to change notification settings - Fork 13.3k
[LV]Enable non-power-of-2 store-load forwarding distance in predicated DataWithEVL vectorization mode #100755
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-risc-v — Author: Alexey Bataev (alexey-bataev). Changes: DataWithEVL tail-folded loops still use scalable vectorization with … (the remainder of this sentence was lost in extraction). Patch is 59.15 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/100755.diff — 16 files affected:
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index cc40d2e83f2e0..b661e117d01ee 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -37,6 +37,8 @@ class Value;
struct VectorizerParams {
/// Maximum SIMD width.
static const unsigned MaxVectorWidth;
+ /// Maximum LMUL factor.
+ static const unsigned MaxVectorLMUL;
/// VF as overridden by the user.
static unsigned VectorizationFactor;
@@ -222,6 +224,23 @@ class MemoryDepChecker {
return MaxSafeVectorWidthInBits;
}
+ /// Return safe power-of-2 number of elements, which do not prevent store-load
+ /// forwarding.
+ std::optional<uint64_t> getStoreLoadForwardSafeVFPowerOf2() const {
+ if (MaxStoreLoadForwardSafeVF.first == std::numeric_limits<uint64_t>::max())
+ return std::nullopt;
+ return MaxStoreLoadForwardSafeVF.first;
+ }
+
+ /// Return safe non-power-of-2 number of elements, which do not prevent
+ /// store-load forwarding.
+ std::optional<uint64_t> getStoreLoadForwardSafeVFNonPowerOf2() const {
+ if (MaxStoreLoadForwardSafeVF.second ==
+ std::numeric_limits<uint64_t>::max())
+ return std::nullopt;
+ return MaxStoreLoadForwardSafeVF.second;
+ }
+
/// In same cases when the dependency check fails we can still
/// vectorize the loop with a dynamic array access check.
bool shouldRetryWithRuntimeCheck() const {
@@ -310,6 +329,12 @@ class MemoryDepChecker {
/// restrictive.
uint64_t MaxSafeVectorWidthInBits = -1U;
+ /// Maximum number of elements (power-of-2 and non-power-of-2), which do not
+ /// prevent store-load forwarding.
+ std::pair<uint64_t, uint64_t> MaxStoreLoadForwardSafeVF =
+ std::make_pair(std::numeric_limits<uint64_t>::max(),
+ std::numeric_limits<uint64_t>::max());
+
/// If we see a non-constant dependence distance we can still try to
/// vectorize this loop with runtime checks.
bool FoundNonConstantDistanceDependence = false;
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 0f4d1355dd2bf..c16a5f9a1344c 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -377,6 +377,18 @@ class LoopVectorizationLegality {
return LAI->getDepChecker().getMaxSafeVectorWidthInBits();
}
+ /// Return safe power-of-2 number of elements, which do not prevent store-load
+ /// forwarding.
+ std::optional<unsigned> getMaxStoreLoadForwardSafeVFPowerOf2() const {
+ return LAI->getDepChecker().getStoreLoadForwardSafeVFPowerOf2();
+ }
+
+ /// Return safe non-power-of-2 number of elements, which do not prevent
+ /// store-load forwarding.
+ std::optional<unsigned> getMaxStoreLoadForwardSafeVFNonPowerOf2() const {
+ return LAI->getDepChecker().getStoreLoadForwardSafeVFNonPowerOf2();
+ }
+
/// Returns true if vector representation of the instruction \p I
/// requires mask.
bool isMaskRequired(const Instruction *I) const {
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 646d2f7ef3077..29816bd1d845c 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -100,6 +100,8 @@ static cl::opt<unsigned> MemoryCheckMergeThreshold(
/// Maximum SIMD width.
const unsigned VectorizerParams::MaxVectorWidth = 64;
+/// Maximum LMUL factor.
+const unsigned VectorizerParams::MaxVectorLMUL = 8;
/// We collect dependences up to this threshold.
static cl::opt<unsigned>
@@ -1764,31 +1766,64 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
// cause any slowdowns.
const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
// Maximum vector factor.
- uint64_t MaxVFWithoutSLForwardIssues = std::min(
- VectorizerParams::MaxVectorWidth * TypeByteSize, MinDepDistBytes);
+ uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
+ std::min(VectorizerParams::MaxVectorWidth * TypeByteSize,
+ MaxStoreLoadForwardSafeVF.first);
+ uint64_t MaxVFWithoutSLForwardIssuesNonPowerOf2 =
+ std::min(VectorizerParams::MaxVectorLMUL *
+ VectorizerParams::MaxVectorWidth * TypeByteSize,
+ MaxStoreLoadForwardSafeVF.second);
// Compute the smallest VF at which the store and load would be misaligned.
- for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
- VF *= 2) {
+ for (uint64_t VF = 2 * TypeByteSize;
+ VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
// If the number of vector iteration between the store and the load are
// small we could incur conflicts.
if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
- MaxVFWithoutSLForwardIssues = (VF >> 1);
+ MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1);
+ break;
+ }
+ }
+ // RISCV VLA supports non-power-2 vector factor. So, we iterate in a
+ // backward order to find largest VF, which allows aligned stores-loads or
+ // the number of iterations between conflicting memory addresses is not less
+ // than 8 (NumItersForStoreLoadThroughMemory).
+ for (uint64_t VF = MaxVFWithoutSLForwardIssuesNonPowerOf2,
+ E = 2 * TypeByteSize;
+ VF >= E; VF -= TypeByteSize) {
+ if (Distance % VF == 0 ||
+ Distance / VF >= NumItersForStoreLoadThroughMemory) {
+ uint64_t GCD = MaxStoreLoadForwardSafeVF.second ==
+ std::numeric_limits<uint64_t>::max()
+ ? VF
+ : std::gcd(MaxStoreLoadForwardSafeVF.second, VF);
+ MaxVFWithoutSLForwardIssuesNonPowerOf2 = GCD;
break;
}
}
- if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
+ if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize &&
+ MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize) {
LLVM_DEBUG(
dbgs() << "LAA: Distance " << Distance
<< " that could cause a store-load forwarding conflict\n");
return true;
}
- if (MaxVFWithoutSLForwardIssues < MinDepDistBytes &&
- MaxVFWithoutSLForwardIssues !=
- VectorizerParams::MaxVectorWidth * TypeByteSize)
- MinDepDistBytes = MaxVFWithoutSLForwardIssues;
+ if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize)
+ MaxStoreLoadForwardSafeVF.first = 1;
+ else if (MaxVFWithoutSLForwardIssuesPowerOf2 <
+ MaxStoreLoadForwardSafeVF.first &&
+ MaxVFWithoutSLForwardIssuesPowerOf2 !=
+ VectorizerParams::MaxVectorWidth * TypeByteSize)
+ MaxStoreLoadForwardSafeVF.first = MaxVFWithoutSLForwardIssuesPowerOf2;
+ if (MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize)
+ MaxStoreLoadForwardSafeVF.second = 1;
+ else if (MaxVFWithoutSLForwardIssuesNonPowerOf2 <
+ MaxStoreLoadForwardSafeVF.second &&
+ MaxVFWithoutSLForwardIssuesNonPowerOf2 !=
+ VectorizerParams::MaxVectorWidth * TypeByteSize)
+ MaxStoreLoadForwardSafeVF.second = MaxVFWithoutSLForwardIssuesNonPowerOf2;
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09ca859f52680..28e814e9c89e9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1444,9 +1444,8 @@ class LoopVectorizationCostModel {
/// Selects and saves TailFoldingStyle for 2 options - if IV update may
/// overflow or not.
- /// \param IsScalableVF true if scalable vector factors enabled.
/// \param UserIC User specific interleave count.
- void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+ void setTailFoldingStyles(unsigned UserIC) {
assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
if (!Legal->canFoldTailByMasking()) {
ChosenTailFoldingStyle =
@@ -1470,11 +1469,9 @@ class LoopVectorizationCostModel {
// FIXME: use actual opcode/data type for analysis here.
// FIXME: Investigate opportunity for fixed vector factor.
bool EVLIsLegal =
- IsScalableVF && UserIC <= 1 &&
+ UserIC <= 1 &&
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
- !EnableVPlanNativePath &&
- // FIXME: implement support for max safe dependency distance.
- Legal->isSafeForAnyVectorWidth();
+ !EnableVPlanNativePath;
if (!EVLIsLegal) {
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
@@ -1492,6 +1489,14 @@ class LoopVectorizationCostModel {
}
}
+ /// Disables previously chosen tail folding policy, sets it to None. Expects,
+ /// that the tail policy was selected.
+ void disableTailFolding() {
+ assert(ChosenTailFoldingStyle && "Tail folding must be selected.");
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+ }
+
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const {
// TODO: check if it is possible to check for None style independent of
@@ -1499,6 +1504,14 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}
+ /// Return maximum safe number of elements to be processed, which do not
+ /// prevent store-load forwarding.
+ /// TODO: need to consider adjusting cost model to use this value as a
+ /// vectorization factor for EVL-based vectorization.
+ std::optional<unsigned> getMaxEVLSafeElements() const {
+ return MaxEVLSafeElements;
+ }
+
/// Returns true if the instructions in this block requires predication
/// for any reason, e.g. because tail folding now requires a predicate
/// or because the block in the original loop was predicated.
@@ -1654,6 +1667,10 @@ class LoopVectorizationCostModel {
/// true if scalable vectorization is supported and enabled.
std::optional<bool> IsScalableVectorizationAllowed;
+ /// Maximum safe number of elements to be processed, which do not
+ /// prevent store-load forwarding.
+ std::optional<unsigned> MaxEVLSafeElements;
+
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
@@ -3903,11 +3920,31 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
// the memory accesses that is most restrictive (involved in the smallest
// dependence distance).
- unsigned MaxSafeElements =
- llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+ unsigned MaxSafeElements = Legal->getMaxSafeVectorWidthInBits() / WidestType;
+ if (Legal->isSafeForAnyVectorWidth())
+ MaxSafeElements = PowerOf2Ceil(MaxSafeElements);
+ unsigned MaxFixedSafeElements = std::gcd(
+ MaxSafeElements,
+ Legal->getMaxStoreLoadForwardSafeVFPowerOf2().value_or(MaxSafeElements));
+ MaxFixedSafeElements = bit_floor(MaxFixedSafeElements);
+ unsigned MaxScalableSafeElements = MaxFixedSafeElements;
+ if (foldTailWithEVL()) {
+ MaxScalableSafeElements = std::numeric_limits<unsigned>::max();
+ std::optional<unsigned> SafeStoreLoadForwarding =
+ Legal->getMaxStoreLoadForwardSafeVFNonPowerOf2();
+ if (!Legal->isSafeForAnyVectorWidth() || SafeStoreLoadForwarding) {
+ unsigned SLForwardDist =
+ Legal->getMaxStoreLoadForwardSafeVFNonPowerOf2().value_or(
+ MaxSafeElements);
+ if (MaxSafeElements >= SLForwardDist)
+ MaxEVLSafeElements = SLForwardDist;
+ else
+ MaxEVLSafeElements = std::gcd(MaxSafeElements, SLForwardDist);
+ }
+ }
- auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
- auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
+ auto MaxSafeFixedVF = ElementCount::getFixed(MaxFixedSafeElements);
+ auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxScalableSafeElements);
LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
<< ".\n");
@@ -4077,7 +4114,13 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
- FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
+ // If we don't know the precise trip count, or if the trip count that we
+ // found modulo the vectorization factor is not zero, try to fold the tail
+ // by masking.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+ setTailFoldingStyles(UserIC);
+ FixedScalableVFPair MaxFactors =
+ computeFeasibleMaxVF(MaxTC, UserVF, foldTailByMasking());
// Avoid tail folding if the trip count is known to be a multiple of any VF
// we choose.
@@ -4108,15 +4151,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Rem->isZero()) {
// Accept MaxFixedVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+ disableTailFolding();
return MaxFactors;
}
}
- // If we don't know the precise trip count, or if the trip count that we
- // found modulo the vectorization factor is not zero, try to fold the tail
- // by masking.
- // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
if (foldTailByMasking()) {
if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
LLVM_DEBUG(
@@ -8388,8 +8427,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanTransforms::optimize(*Plan, *PSE.getSE());
// TODO: try to put it close to addActiveLaneMask().
// Discard the plan if it is not EVL-compatible
- if (CM.foldTailWithEVL() &&
- !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
+ if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
+ *Plan, CM.getMaxEVLSafeElements()))
break;
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2d6d67a55c17d..de24688593ebe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -471,6 +471,11 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
assert(State.VF.isScalable() && "Expected scalable vector factor.");
Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
+ if (getNumOperands() == 3) {
+ Value *MaxSafeVF = State.get(getOperand(2), VPIteration(0, 0));
+ AVL = State.Builder.CreateBinaryIntrinsic(Intrinsic::umin, AVL,
+ MaxSafeVF);
+ }
Value *EVL = State.Builder.CreateIntrinsic(
State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
{AVL, VFArg, State.Builder.getTrue()});
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c91fd0f118e31..e703bb893d938 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1427,7 +1427,8 @@ void VPlanTransforms::addActiveLaneMask(
/// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
/// ...
///
-bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
+bool VPlanTransforms::tryAddExplicitVectorLength(
+ VPlan &Plan, const std::optional<unsigned> &MaxEVLSafeElements) {
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
// The transform updates all users of inductions to work based on EVL, instead
// of the VF directly. At the moment, widened inductions cannot be updated, so
@@ -1452,8 +1453,12 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
// Create the ExplicitVectorLengthPhi recipe in the main loop.
auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
EVLPhi->insertAfter(CanonicalIVPHI);
- auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
- {EVLPhi, Plan.getTripCount()});
+ SmallVector<VPValue *, 3> Operands = {EVLPhi, Plan.getTripCount()};
+ if (MaxEVLSafeElements)
+ Operands.push_back(Plan.getOrAddLiveIn(ConstantInt::get(
+ CanonicalIVPHI->getScalarType(), *MaxEVLSafeElements)));
+ auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength, Operands,
+ DebugLoc());
VPEVL->insertBefore(*Header, Header->getFirstNonPhi());
auto *CanonicalIVIncrement =
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 96b8a6639723c..8158c832f1a95 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -105,7 +105,9 @@ struct VPlanTransforms {
/// VPCanonicalIVPHIRecipe is only used to control the loop after
/// this transformation.
/// \returns true if the transformation succeeds, or false if it doesn't.
- static bool tryAddExplicitVectorLength(VPlan &Plan);
+ static bool
+ tryAddExplicitVectorLength(VPlan &Plan,
+ const std::optional<unsigned> &MaxEVLSafeElements);
};
} // namespace llvm
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
index 81d8b01fe7fb7..c5ba25a5c0ace 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
@@ -140,11 +140,11 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
; CHECK-EMPTY:
-; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: BackwardVectorizable:
; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8
; CHECK-EMPTY:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8 ->
; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8
; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
index 7fc9958dba552..6e4bcec013a73 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
@@ -24,14 +24,13 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define void @f(ptr noalias %A, ptr noalias %B, ptr noalias %C, i64 %N) {
; CHECK-LABEL: 'f'
; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Forward:
; CHECK-NEXT: store i32 %b_p1, ptr %Aidx, align 4 ->
; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
; CHECK-EMPTY:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
; CHECK-NEXT: store i32 %b_p2, ptr %Aidx_next, align 4 ->
; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll
...
[truncated]
|
@llvm/pr-subscribers-llvm-analysis — Author: Alexey Bataev (alexey-bataev). Changes: DataWithEVL tail-folded loops still use scalable vectorization with … (the remainder of this sentence was lost in extraction). Patch is 59.15 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/100755.diff — 16 files affected:
diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index cc40d2e83f2e0..b661e117d01ee 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -37,6 +37,8 @@ class Value;
struct VectorizerParams {
/// Maximum SIMD width.
static const unsigned MaxVectorWidth;
+ /// Maximum LMUL factor.
+ static const unsigned MaxVectorLMUL;
/// VF as overridden by the user.
static unsigned VectorizationFactor;
@@ -222,6 +224,23 @@ class MemoryDepChecker {
return MaxSafeVectorWidthInBits;
}
+ /// Return safe power-of-2 number of elements, which do not prevent store-load
+ /// forwarding.
+ std::optional<uint64_t> getStoreLoadForwardSafeVFPowerOf2() const {
+ if (MaxStoreLoadForwardSafeVF.first == std::numeric_limits<uint64_t>::max())
+ return std::nullopt;
+ return MaxStoreLoadForwardSafeVF.first;
+ }
+
+ /// Return safe non-power-of-2 number of elements, which do not prevent
+ /// store-load forwarding.
+ std::optional<uint64_t> getStoreLoadForwardSafeVFNonPowerOf2() const {
+ if (MaxStoreLoadForwardSafeVF.second ==
+ std::numeric_limits<uint64_t>::max())
+ return std::nullopt;
+ return MaxStoreLoadForwardSafeVF.second;
+ }
+
/// In same cases when the dependency check fails we can still
/// vectorize the loop with a dynamic array access check.
bool shouldRetryWithRuntimeCheck() const {
@@ -310,6 +329,12 @@ class MemoryDepChecker {
/// restrictive.
uint64_t MaxSafeVectorWidthInBits = -1U;
+ /// Maximum number of elements (power-of-2 and non-power-of-2), which do not
+ /// prevent store-load forwarding.
+ std::pair<uint64_t, uint64_t> MaxStoreLoadForwardSafeVF =
+ std::make_pair(std::numeric_limits<uint64_t>::max(),
+ std::numeric_limits<uint64_t>::max());
+
/// If we see a non-constant dependence distance we can still try to
/// vectorize this loop with runtime checks.
bool FoundNonConstantDistanceDependence = false;
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
index 0f4d1355dd2bf..c16a5f9a1344c 100644
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -377,6 +377,18 @@ class LoopVectorizationLegality {
return LAI->getDepChecker().getMaxSafeVectorWidthInBits();
}
+ /// Return safe power-of-2 number of elements, which do not prevent store-load
+ /// forwarding.
+ std::optional<unsigned> getMaxStoreLoadForwardSafeVFPowerOf2() const {
+ return LAI->getDepChecker().getStoreLoadForwardSafeVFPowerOf2();
+ }
+
+ /// Return safe non-power-of-2 number of elements, which do not prevent
+ /// store-load forwarding.
+ std::optional<unsigned> getMaxStoreLoadForwardSafeVFNonPowerOf2() const {
+ return LAI->getDepChecker().getStoreLoadForwardSafeVFNonPowerOf2();
+ }
+
/// Returns true if vector representation of the instruction \p I
/// requires mask.
bool isMaskRequired(const Instruction *I) const {
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 646d2f7ef3077..29816bd1d845c 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -100,6 +100,8 @@ static cl::opt<unsigned> MemoryCheckMergeThreshold(
/// Maximum SIMD width.
const unsigned VectorizerParams::MaxVectorWidth = 64;
+/// Maximum LMUL factor.
+const unsigned VectorizerParams::MaxVectorLMUL = 8;
/// We collect dependences up to this threshold.
static cl::opt<unsigned>
@@ -1764,31 +1766,64 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
// cause any slowdowns.
const uint64_t NumItersForStoreLoadThroughMemory = 8 * TypeByteSize;
// Maximum vector factor.
- uint64_t MaxVFWithoutSLForwardIssues = std::min(
- VectorizerParams::MaxVectorWidth * TypeByteSize, MinDepDistBytes);
+ uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
+ std::min(VectorizerParams::MaxVectorWidth * TypeByteSize,
+ MaxStoreLoadForwardSafeVF.first);
+ uint64_t MaxVFWithoutSLForwardIssuesNonPowerOf2 =
+ std::min(VectorizerParams::MaxVectorLMUL *
+ VectorizerParams::MaxVectorWidth * TypeByteSize,
+ MaxStoreLoadForwardSafeVF.second);
// Compute the smallest VF at which the store and load would be misaligned.
- for (uint64_t VF = 2 * TypeByteSize; VF <= MaxVFWithoutSLForwardIssues;
- VF *= 2) {
+ for (uint64_t VF = 2 * TypeByteSize;
+ VF <= MaxVFWithoutSLForwardIssuesPowerOf2; VF *= 2) {
// If the number of vector iteration between the store and the load are
// small we could incur conflicts.
if (Distance % VF && Distance / VF < NumItersForStoreLoadThroughMemory) {
- MaxVFWithoutSLForwardIssues = (VF >> 1);
+ MaxVFWithoutSLForwardIssuesPowerOf2 = (VF >> 1);
+ break;
+ }
+ }
+ // RISCV VLA supports non-power-2 vector factor. So, we iterate in a
+ // backward order to find largest VF, which allows aligned stores-loads or
+ // the number of iterations between conflicting memory addresses is not less
+ // than 8 (NumItersForStoreLoadThroughMemory).
+ for (uint64_t VF = MaxVFWithoutSLForwardIssuesNonPowerOf2,
+ E = 2 * TypeByteSize;
+ VF >= E; VF -= TypeByteSize) {
+ if (Distance % VF == 0 ||
+ Distance / VF >= NumItersForStoreLoadThroughMemory) {
+ uint64_t GCD = MaxStoreLoadForwardSafeVF.second ==
+ std::numeric_limits<uint64_t>::max()
+ ? VF
+ : std::gcd(MaxStoreLoadForwardSafeVF.second, VF);
+ MaxVFWithoutSLForwardIssuesNonPowerOf2 = GCD;
break;
}
}
- if (MaxVFWithoutSLForwardIssues < 2 * TypeByteSize) {
+ if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize &&
+ MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize) {
LLVM_DEBUG(
dbgs() << "LAA: Distance " << Distance
<< " that could cause a store-load forwarding conflict\n");
return true;
}
- if (MaxVFWithoutSLForwardIssues < MinDepDistBytes &&
- MaxVFWithoutSLForwardIssues !=
- VectorizerParams::MaxVectorWidth * TypeByteSize)
- MinDepDistBytes = MaxVFWithoutSLForwardIssues;
+ if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize)
+ MaxStoreLoadForwardSafeVF.first = 1;
+ else if (MaxVFWithoutSLForwardIssuesPowerOf2 <
+ MaxStoreLoadForwardSafeVF.first &&
+ MaxVFWithoutSLForwardIssuesPowerOf2 !=
+ VectorizerParams::MaxVectorWidth * TypeByteSize)
+ MaxStoreLoadForwardSafeVF.first = MaxVFWithoutSLForwardIssuesPowerOf2;
+ if (MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize)
+ MaxStoreLoadForwardSafeVF.second = 1;
+ else if (MaxVFWithoutSLForwardIssuesNonPowerOf2 <
+ MaxStoreLoadForwardSafeVF.second &&
+ MaxVFWithoutSLForwardIssuesNonPowerOf2 !=
+ VectorizerParams::MaxVectorWidth * TypeByteSize)
+ MaxStoreLoadForwardSafeVF.second = MaxVFWithoutSLForwardIssuesNonPowerOf2;
return false;
}
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 09ca859f52680..28e814e9c89e9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1444,9 +1444,8 @@ class LoopVectorizationCostModel {
/// Selects and saves TailFoldingStyle for 2 options - if IV update may
/// overflow or not.
- /// \param IsScalableVF true if scalable vector factors enabled.
/// \param UserIC User specific interleave count.
- void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
+ void setTailFoldingStyles(unsigned UserIC) {
assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
if (!Legal->canFoldTailByMasking()) {
ChosenTailFoldingStyle =
@@ -1470,11 +1469,9 @@ class LoopVectorizationCostModel {
// FIXME: use actual opcode/data type for analysis here.
// FIXME: Investigate opportunity for fixed vector factor.
bool EVLIsLegal =
- IsScalableVF && UserIC <= 1 &&
+ UserIC <= 1 &&
TTI.hasActiveVectorLength(0, nullptr, Align()) &&
- !EnableVPlanNativePath &&
- // FIXME: implement support for max safe dependency distance.
- Legal->isSafeForAnyVectorWidth();
+ !EnableVPlanNativePath;
if (!EVLIsLegal) {
// If for some reason EVL mode is unsupported, fallback to
// DataWithoutLaneMask to try to vectorize the loop with folded tail
@@ -1492,6 +1489,14 @@ class LoopVectorizationCostModel {
}
}
+ /// Disables previously chosen tail folding policy, sets it to None. Expects,
+ /// that the tail policy was selected.
+ void disableTailFolding() {
+ assert(ChosenTailFoldingStyle && "Tail folding must be selected.");
+ ChosenTailFoldingStyle =
+ std::make_pair(TailFoldingStyle::None, TailFoldingStyle::None);
+ }
+
/// Returns true if all loop blocks should be masked to fold tail loop.
bool foldTailByMasking() const {
// TODO: check if it is possible to check for None style independent of
@@ -1499,6 +1504,14 @@ class LoopVectorizationCostModel {
return getTailFoldingStyle() != TailFoldingStyle::None;
}
+ /// Return maximum safe number of elements to be processed, which do not
+ /// prevent store-load forwarding.
+ /// TODO: need to consider adjusting cost model to use this value as a
+ /// vectorization factor for EVL-based vectorization.
+ std::optional<unsigned> getMaxEVLSafeElements() const {
+ return MaxEVLSafeElements;
+ }
+
/// Returns true if the instructions in this block requires predication
/// for any reason, e.g. because tail folding now requires a predicate
/// or because the block in the original loop was predicated.
@@ -1654,6 +1667,10 @@ class LoopVectorizationCostModel {
/// true if scalable vectorization is supported and enabled.
std::optional<bool> IsScalableVectorizationAllowed;
+ /// Maximum safe number of elements to be processed, which do not
+ /// prevent store-load forwarding.
+ std::optional<unsigned> MaxEVLSafeElements;
+
/// A map holding scalar costs for different vectorization factors. The
/// presence of a cost for an instruction in the mapping indicates that the
/// instruction will be scalarized when vectorizing with the associated
@@ -3903,11 +3920,31 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
// the memory accesses that is most restrictive (involved in the smallest
// dependence distance).
- unsigned MaxSafeElements =
- llvm::bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
+ unsigned MaxSafeElements = Legal->getMaxSafeVectorWidthInBits() / WidestType;
+ if (Legal->isSafeForAnyVectorWidth())
+ MaxSafeElements = PowerOf2Ceil(MaxSafeElements);
+ unsigned MaxFixedSafeElements = std::gcd(
+ MaxSafeElements,
+ Legal->getMaxStoreLoadForwardSafeVFPowerOf2().value_or(MaxSafeElements));
+ MaxFixedSafeElements = bit_floor(MaxFixedSafeElements);
+ unsigned MaxScalableSafeElements = MaxFixedSafeElements;
+ if (foldTailWithEVL()) {
+ MaxScalableSafeElements = std::numeric_limits<unsigned>::max();
+ std::optional<unsigned> SafeStoreLoadForwarding =
+ Legal->getMaxStoreLoadForwardSafeVFNonPowerOf2();
+ if (!Legal->isSafeForAnyVectorWidth() || SafeStoreLoadForwarding) {
+ unsigned SLForwardDist =
+ Legal->getMaxStoreLoadForwardSafeVFNonPowerOf2().value_or(
+ MaxSafeElements);
+ if (MaxSafeElements >= SLForwardDist)
+ MaxEVLSafeElements = SLForwardDist;
+ else
+ MaxEVLSafeElements = std::gcd(MaxSafeElements, SLForwardDist);
+ }
+ }
- auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
- auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
+ auto MaxSafeFixedVF = ElementCount::getFixed(MaxFixedSafeElements);
+ auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxScalableSafeElements);
LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
<< ".\n");
@@ -4077,7 +4114,13 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
}
- FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTC, UserVF, true);
+ // If we don't know the precise trip count, or if the trip count that we
+ // found modulo the vectorization factor is not zero, try to fold the tail
+ // by masking.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+ setTailFoldingStyles(UserIC);
+ FixedScalableVFPair MaxFactors =
+ computeFeasibleMaxVF(MaxTC, UserVF, foldTailByMasking());
// Avoid tail folding if the trip count is known to be a multiple of any VF
// we choose.
@@ -4108,15 +4151,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
if (Rem->isZero()) {
// Accept MaxFixedVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+ disableTailFolding();
return MaxFactors;
}
}
- // If we don't know the precise trip count, or if the trip count that we
- // found modulo the vectorization factor is not zero, try to fold the tail
- // by masking.
- // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- setTailFoldingStyles(MaxFactors.ScalableVF.isScalable(), UserIC);
if (foldTailByMasking()) {
if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
LLVM_DEBUG(
@@ -8388,8 +8427,8 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
VPlanTransforms::optimize(*Plan, *PSE.getSE());
// TODO: try to put it close to addActiveLaneMask().
// Discard the plan if it is not EVL-compatible
- if (CM.foldTailWithEVL() &&
- !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
+ if (CM.foldTailWithEVL() && !VPlanTransforms::tryAddExplicitVectorLength(
+ *Plan, CM.getMaxEVLSafeElements()))
break;
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
VPlans.push_back(std::move(Plan));
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 2d6d67a55c17d..de24688593ebe 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -471,6 +471,11 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) {
assert(State.VF.isScalable() && "Expected scalable vector factor.");
Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue());
+ if (getNumOperands() == 3) {
+ Value *MaxSafeVF = State.get(getOperand(2), VPIteration(0, 0));
+ AVL = State.Builder.CreateBinaryIntrinsic(Intrinsic::umin, AVL,
+ MaxSafeVF);
+ }
Value *EVL = State.Builder.CreateIntrinsic(
State.Builder.getInt32Ty(), Intrinsic::experimental_get_vector_length,
{AVL, VFArg, State.Builder.getTrue()});
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index c91fd0f118e31..e703bb893d938 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1427,7 +1427,8 @@ void VPlanTransforms::addActiveLaneMask(
/// %NextEVLIV = add IVSize (cast i32 %VPEVVL to IVSize), %EVLPhi
/// ...
///
-bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
+bool VPlanTransforms::tryAddExplicitVectorLength(
+ VPlan &Plan, const std::optional<unsigned> &MaxEVLSafeElements) {
VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock();
// The transform updates all users of inductions to work based on EVL, instead
// of the VF directly. At the moment, widened inductions cannot be updated, so
@@ -1452,8 +1453,12 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
// Create the ExplicitVectorLengthPhi recipe in the main loop.
auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
EVLPhi->insertAfter(CanonicalIVPHI);
- auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength,
- {EVLPhi, Plan.getTripCount()});
+ SmallVector<VPValue *, 3> Operands = {EVLPhi, Plan.getTripCount()};
+ if (MaxEVLSafeElements)
+ Operands.push_back(Plan.getOrAddLiveIn(ConstantInt::get(
+ CanonicalIVPHI->getScalarType(), *MaxEVLSafeElements)));
+ auto *VPEVL = new VPInstruction(VPInstruction::ExplicitVectorLength, Operands,
+ DebugLoc());
VPEVL->insertBefore(*Header, Header->getFirstNonPhi());
auto *CanonicalIVIncrement =
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 96b8a6639723c..8158c832f1a95 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -105,7 +105,9 @@ struct VPlanTransforms {
/// VPCanonicalIVPHIRecipe is only used to control the loop after
/// this transformation.
/// \returns true if the transformation succeeds, or false if it doesn't.
- static bool tryAddExplicitVectorLength(VPlan &Plan);
+ static bool
+ tryAddExplicitVectorLength(VPlan &Plan,
+ const std::optional<unsigned> &MaxEVLSafeElements);
};
} // namespace llvm
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
index 81d8b01fe7fb7..c5ba25a5c0ace 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/depend_diff_types.ll
@@ -140,11 +140,11 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) {
; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8
; CHECK-EMPTY:
-; CHECK-NEXT: BackwardVectorizableButPreventsForwarding:
+; CHECK-NEXT: BackwardVectorizable:
; CHECK-NEXT: %ld.f64 = load double, ptr %gep.iv, align 8 ->
; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8
; CHECK-EMPTY:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
; CHECK-NEXT: store double %val, ptr %gep.iv.101.i64, align 8 ->
; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8
; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
index 7fc9958dba552..6e4bcec013a73 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/forward-loop-independent.ll
@@ -24,14 +24,13 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
define void @f(ptr noalias %A, ptr noalias %B, ptr noalias %C, i64 %N) {
; CHECK-LABEL: 'f'
; CHECK-NEXT: for.body:
-; CHECK-NEXT: Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
-; CHECK-NEXT: Forward loop carried data dependence that prevents store-to-load forwarding.
+; CHECK-NEXT: Memory dependences are safe
; CHECK-NEXT: Dependences:
; CHECK-NEXT: Forward:
; CHECK-NEXT: store i32 %b_p1, ptr %Aidx, align 4 ->
; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
; CHECK-EMPTY:
-; CHECK-NEXT: ForwardButPreventsForwarding:
+; CHECK-NEXT: Forward:
; CHECK-NEXT: store i32 %b_p2, ptr %Aidx_next, align 4 ->
; CHECK-NEXT: %a = load i32, ptr %Aidx, align 4
; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll b/llvm/test/Analysis/LoopAccessAnalysis/forward-negative-step.ll
...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Created using spr 1.3.5
Ping! |
1 similar comment
Ping! |
@@ -37,6 +37,8 @@ class Value; | |||
struct VectorizerParams { | |||
/// Maximum SIMD width. | |||
static const unsigned MaxVectorWidth; | |||
/// Maximum LMUL factor. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LMUL is a RISC-V-specific term; would it be possible to frame this in more general terms?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1, how about UF?
Note that UF in general may be a non-power-of-2, whereas LMUL is always a power of 2, iiuc.
@@ -100,6 +100,8 @@ static cl::opt<unsigned> MemoryCheckMergeThreshold( | |||
|
|||
/// Maximum SIMD width. | |||
const unsigned VectorizerParams::MaxVectorWidth = 64; | |||
/// Maximum LMUL factor. | |||
const unsigned VectorizerParams::MaxVectorLMUL = 8; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this upper bound RISCV specific?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
AFAIU yes, stemming from RVV (the RISC-V vector extension) having 32 architected vector registers, which become 4 register groups with LMUL = 8. According to https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc, LMUL can be 1, 2, 4, 8 and also 1/2, 1/4, 1/8.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently, yes. I think we can sink it to TTI later if some support for other targets is required.
@@ -140,11 +140,11 @@ define void @neg_dist_dep_type_size_equivalence(ptr nocapture %vec, i64 %n) { | |||
; CHECK-NEXT: %ld.i64 = load i64, ptr %gep.iv, align 8 -> | |||
; CHECK-NEXT: store i32 %ld.i64.i32, ptr %gep.iv.n.i64, align 8 | |||
; CHECK-EMPTY: | |||
; CHECK-NEXT: BackwardVectorizableButPreventsForwarding: | |||
; CHECK-NEXT: BackwardVectorizable: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Those are all undesired functional changes that don't seem directly related to the goal of the patch?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is unfortunate but true. The problem is that, previously, the compiler could decide during loop access analysis that the loop cannot be vectorized if the distance is not a power of 2. With this patch, we cannot do this anymore, because we do not know how the loop is going to be vectorized: using a power-of-2 fixed vector length, or using scalable vectorization with non-power-of-2 support.
break; | ||
} | ||
} | ||
// RISCV VLA supports non-power-2 vector factor. So, we iterate in a |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this needed for correctness for RISCV? If not, can be done separately as this adds some extra complexity.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Initially, we can support only power-of-2. I can split this patch into 2 sub-patches.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes please, this would help to make the patch simpler hopefully.
Also, MaxStoreLoadForwardSafeVF
is not really a legality constraint but a cost constraint (to prevent cases where the HW supports store-to-load forwarding, which may be faster than a vector loop that does not allow store->load forwarding).
Is this relevant for cores supporting EVL?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The initial patch with only power-of-2 support is committed already, this one adds non-power-of-2
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also MaxStoreLoadForwardSafeVF is not really a legality constraint but a cost constraint (to prevent cases where the HW support Store to load forwarding, which may be faster than a vector loop not allowing for store->load forwarding).
Right.
Is this relevant for cores supporting EVL?
Yes, but it supports non-power-of-2 sizes due to its nature
@@ -471,6 +471,11 @@ Value *VPInstruction::generatePerPart(VPTransformState &State, unsigned Part) { | |||
assert(State.VF.isScalable() && "Expected scalable vector factor."); | |||
Value *VFArg = State.Builder.getInt32(State.VF.getKnownMinValue()); | |||
|
|||
if (getNumOperands() == 3) { | |||
Value *MaxSafeVF = State.get(getOperand(2), VPIteration(0, 0)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Document the ExplicitVectorLength opcode in VPlan.h with the additional optional operand.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will do
Ping! |
3 similar comments
Ping! |
Ping! |
Ping! |
break; | ||
} | ||
} | ||
// RISCV VLA supports non-power-2 vector factor. So, we iterate in a |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes please, this would help to make the patch simpler hopefully.
Also, MaxStoreLoadForwardSafeVF
is not really a legality constraint but a cost constraint (to prevent cases where the HW supports store-to-load forwarding, which may be faster than a vector loop that does not allow store->load forwarding).
Is this relevant for cores supporting EVL?
Created using spr 1.3.5
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is quite hard to fully understand what is going on exactly in the code changes. There are some test changes in non EVL tests and tests that do not have any store-load forward dependencies, is this expected?
The patch seems to do 3 separate things
- consider MaxStoreLoadForwardSafeVFPower2
- consider MaxStoreLoadForwardSafeVFNonPower2
- changes behavior in non EVL mode and EVL mode without store-load forward issues?
Can we limit it to MaxStoreLoadForwardSafeVFPower2 initially, to reduce the complexity?
Will the cost model also need to be updated to consider the smaller VF?
@@ -226,7 +226,7 @@ for.end: | |||
|
|||
;Check the new calculation of the maximum safe distance in bits which can be vectorized. | |||
;The previous behavior did not take account that the stride was 2. | |||
;Therefore the maxVF was computed as 8 instead of 4, as the dependence distance here is 6 iterations, given by |N-(N-12)|/2. | |||
;Therefore the maxVF was computed as 8 instead of 2, as the dependence distance here is 6 iterations, given by |N-(N-12)|/2. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Change in existing behavior?
@@ -24,65 +24,17 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 | |||
define void @maxvf3() { | |||
; CHECK-LABEL: @maxvf3( | |||
; CHECK-NEXT: entry: | |||
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
change in existing behavior?
// Accept MaxFixedVF if we do not have a tail. | ||
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); | ||
return MaxFactors; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why change this to not exit early? No tail is needed but we still set the tail-folding style?
@@ -192,17 +207,52 @@ exit: | |||
define void @test_may_clobber2(ptr %p) { | |||
; IF-EVL-LABEL: @test_may_clobber2( | |||
; IF-EVL-NEXT: entry: | |||
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have a hard time understanding how this change is related to the MaxStoreLoadForwardSafe changes; there is no store load forward issue here I think?
/// Maximum unroll factor. Can represent actual unroll factor and/or some | ||
/// other target-specific features, like LMUL factor for RISC-V with RVV | ||
/// support. | ||
static const unsigned MaxVectorUF; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
At the moment, we never interleave in EVL mode, why do we need to account for LMUL here?
@@ -328,6 +350,9 @@ class MemoryDepChecker { | |||
/// backwards-vectorizable or unknown (triggering a runtime check). | |||
unsigned MaxTargetVectorWidthInBits = 0; | |||
|
|||
/// True if current target supports non-power-of-2 dependence distances. | |||
bool AllowNonPow2Deps = false; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is confusing, in general any target supports non-power-of-2 dependence distances, if >= VF?
@@ -304,6 +321,11 @@ class MemoryDepChecker { | |||
/// restrictive. | |||
uint64_t MaxSafeVectorWidthInBits = -1U; | |||
|
|||
/// Maximum number of elements (power-of-2 and non-power-of-2), which do not | |||
/// prevent store-load forwarding and safe to operate simultaneously. | |||
std::pair<std::optional<uint64_t>, std::optional<uint64_t>> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need to distinguish between non-power and power of 2 here, can't we just use the max of both?
MaxSafeElementsPowerOf2); | ||
} | ||
MaxSafeScalableVF = | ||
getMaxLegalScalableVF(std::numeric_limits<unsigned>::max()); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why ignore MaxSafeElements here?
This will somehow set the MaxScalableVF to non-power-of-2?
…llvm#121156) The patch splits the store-load forwarding distance analysis from the other dependency analysis in LAA. Currently it supports only power-of-2 distances; the split is required to support non-power-of-2 distances in the future. Part of llvm#100755
DataWithEVL tail-folded loops still use scalable vectorization, with
a special check for the max safe distance, which makes it possible to
support non-power-of-2 distances.
The patch does extra analysis of the max store-load forwarding distance
to allow non-power-of-2 distances. To do so, it has to relax some
checks, since the compiler does not yet know how the loop will be
vectorized (in EVL predicated mode or not).