[LV][EVL] Generate negative strided load/store for reversed load/store #123608
base: main
Conversation
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-llvm-transforms

Author: Pengcheng Wang (wangpc-pp)

Changes

This can reduce the operations needed to reverse the mask, the load result, and the stored value.

Full diff: https://github.com/llvm/llvm-project/pull/123608.diff

3 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index aa5f92b235555e..587c7e9b4417fa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2603,17 +2603,6 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif
-/// Use all-true mask for reverse rather than actual mask, as it avoids a
-/// dependence w/o affecting the result.
-static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
- Value *EVL, const Twine &Name) {
- VectorType *ValTy = cast<VectorType>(Operand->getType());
- Value *AllTrueMask =
- Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
- return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
- {Operand, AllTrueMask, EVL}, nullptr, Name);
-}
-
void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
auto *LI = cast<LoadInst>(&Ingredient);
@@ -2630,8 +2619,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
Mask = State.get(VPMask);
- if (isReverse())
- Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
} else {
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
}
@@ -2641,17 +2628,29 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
nullptr, "wide.masked.gather");
} else {
- VectorBuilder VBuilder(Builder);
- VBuilder.setEVL(EVL).setMask(Mask);
- NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
- Instruction::Load, DataTy, Addr, "vp.op.load"));
+ if (isReverse()) {
+ auto *EltTy = DataTy->getElementType();
+ auto *PtrTy = Addr->getType();
+ Value *Operands[] = {
+ Addr,
+ ConstantInt::getSigned(
+ Builder.getInt32Ty(),
+ -static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8),
+ Mask, EVL};
+ NewLI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
+ {DataTy, PtrTy, Builder.getInt32Ty()},
+ Operands, nullptr, "vp.neg.strided.load");
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Load, DataTy, Addr, "vp.op.load"));
+ }
}
NewLI->addParamAttr(
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
State.addMetadata(NewLI, LI);
Instruction *Res = NewLI;
- if (isReverse())
- Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
State.set(this, Res);
}
@@ -2749,13 +2748,9 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue);
Value *EVL = State.get(getEVL(), VPLane(0));
- if (isReverse())
- StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
Mask = State.get(VPMask);
- if (isReverse())
- Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
} else {
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
}
@@ -2765,11 +2760,26 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else {
- VectorBuilder VBuilder(Builder);
- VBuilder.setEVL(EVL).setMask(Mask);
- NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
- Instruction::Store, Type::getVoidTy(EVL->getContext()),
- {StoredVal, Addr}));
+ if (isReverse()) {
+ Type *StoredValTy = StoredVal->getType();
+ auto *EltTy = cast<VectorType>(StoredValTy)->getElementType();
+ auto *PtrTy = Addr->getType();
+ Value *Operands[] = {
+ StoredVal, Addr,
+ ConstantInt::getSigned(
+ Builder.getInt32Ty(),
+ -static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8),
+ Mask, EVL};
+ NewSI = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vp_strided_store,
+ {StoredValTy, PtrTy, Builder.getInt32Ty()}, Operands);
+ } else {
+ VectorBuilder VBuilder(Builder);
+ VBuilder.setEVL(EVL).setMask(Mask);
+ NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+ Instruction::Store, Type::getVoidTy(EVL->getContext()),
+ {StoredVal, Addr}));
+ }
}
NewSI->addParamAttr(
1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 5b579b0749c677..ba65137e94935c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -39,16 +39,14 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]]
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_NEG_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP12]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]]
; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]]
; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
-; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[VP_NEG_STRIDED_LOAD]], ptr align 4 [[TMP17]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -153,18 +151,14 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]]
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT: [[VP_NEG_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP20]], i32 -4, <vscale x 4 x i1> [[TMP15]], i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]]
; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP27]]
; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
+; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[VP_NEG_STRIDED_LOAD]], ptr align 4 [[TMP25]], i32 -4, <vscale x 4 x i1> [[TMP15]], i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -280,8 +274,7 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP9]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]]
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]]
-; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.strided.load.nxv16i8.p0.i32(ptr align 1 [[TMP13]], i32 -1, <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_REVERSE]]
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP7]]
@@ -290,16 +283,14 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP16]]
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i32(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP20]], i32 -1, <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP22]]
; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 1, [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
-; IF-EVL-NEXT: [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE2]], ptr align 1 [[TMP26]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i32(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP26]], i32 -1, <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index a2f85b9ed4ffe1..69ba0bad45de6a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -43,8 +43,7 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP15]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP14]], i64 [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP19]], i64 [[TMP18]]
-; CHECK-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP20]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i32(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[TMP20]], i32 -8, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]
Is it correctly represented in the cost model?
Thanks, this is much preferred to having to lower experimental_vp_reverse in the RISC-V backend.
It looks like this also removes the only use of llvm.experimental.vp.reverse. Do we think we will need it for anything else in the EVL line of work? If not, could we eventually end up removing it?
There is another potential issue: some targets might not support strided operations, so it would be good to add a check, and possibly a new flag in the load/store recipes for strided ops.
I don't think I'm too concerned about other targets, since as far as I'm aware RISC-V is the only target that can lower llvm.experimental.vp.reverse today. And we don't seem to currently check if it's supported anyway?
Today, maybe. But the EVL vectorizer is not RISC-V-only (generally speaking!), so we need to provide full correctness here for all potential targets.
Yes, because we never emit strided ops in LV for now. To emit them, we need to correctly implement the legality checks and the cost model.
I meant that we don't check whether llvm.experimental.vp.reverse is supported, and it doesn't look like there's a default expansion for it.
It is not correct and should be checked.
I am not so familiar with the current LV infrastructure; how should I check the legality and the cost model here?
If it's not supported, then the cost should return Invalid, which should prevent the plan from being selected, I think.
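To illustrate that suggestion, a minimal sketch (not code from this patch) of costing the reversed access through the strided-memory TTI hook; the wrapper function and parameter names are my assumptions, only getStridedMemoryOpCost and InstructionCost are existing LLVM APIs:

#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

// Sketch: cost a reversed load/store as a strided memory operation. If the
// target cannot lower strided accesses, the hook is expected to return an
// invalid cost, and a VPlan whose total cost is invalid is not selected.
static InstructionCost
costAsStridedAccess(const TargetTransformInfo &TTI, unsigned Opcode,
                    Type *DataTy, const Value *Ptr, Align Alignment,
                    TargetTransformInfo::TargetCostKind CostKind) {
  return TTI.getStridedMemoryOpCost(Opcode, DataTy, Ptr,
                                    /*VariableMask=*/false, Alignment,
                                    CostKind);
}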
Thanks! I think I have followed your suggestion and ported the new cost model. But it seems that the legacy cost model and the VPlan-based cost model make different decisions about the best factor. I debugged it and I don't think there is anything wrong.
LGTM. Probably best to wait for @fhahn or @alexey-bataev to review it too
I am wondering about the necessity of emitting vp.strided.load/store in the vectorizer stage.
Can we convert load + reverse / reverse + store to strided load/store in CodeGenPrepare? Similar functionality exists in RISC-V gather/scatter lowering and the Interleaved Access pass.
https://godbolt.org/z/rc3zPqrez
One benefit of emitting the strided intrinsics earlier is that it allows us to use the strided-memory-op TTI hook in the cost model. Otherwise we would have to conservatively cost it as a wide load + reverse, and IIRC the reverse is still costed as a quadratic vrgather today.
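To make that comparison concrete, a rough sketch under the assumption that the conservative model is a plain memory op plus a reverse shuffle; the helper and its exact default arguments are illustrative, not part of this patch:

#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

// Compare costing a reversed access as one strided memory op against the
// conservative model of a wide load/store plus a reverse shuffle (which
// some targets lower to an expensive vrgather).
static bool stridedLooksCheaper(const TargetTransformInfo &TTI,
                                unsigned Opcode, VectorType *VecTy,
                                const Value *Ptr, Align Alignment,
                                TargetTransformInfo::TargetCostKind CostKind) {
  InstructionCost Strided = TTI.getStridedMemoryOpCost(
      Opcode, VecTy, Ptr, /*VariableMask=*/false, Alignment, CostKind);
  InstructionCost WidePlusReverse =
      TTI.getMemoryOpCost(Opcode, VecTy, Alignment, /*AddressSpace=*/0,
                          CostKind) +
      TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VecTy);
  return Strided.isValid() && Strided < WidePlusReverse;
}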
Are there more comments? I hope I can land this before my Chinese New Year holidays. :-)
LGTM, thanks
A more precise cost model sounds like a good reason. Another one is the enum InstWidening, which is usually where decisions on how to widen memory accesses are made. It might be helpful.
I guess it depends on what we want the canonical form for a reversed load to be, i.e. a vp intrinsic or load + reverse. My preference would be the former, since it's easier to pattern match. Although I didn't realise this would be unprofitable on x280; in that case I think it's reasonable if we want to check the cost of
return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty,
                                      getAddr()->getUnderlyingValue(), false,
                                      Alignment, Ctx.CostKind);
What will happen if getStridedMemoryOpCost returns Invalid?
Then the cost of the VPlan will be Invalid and we emit a remark?
Emitting Invalid in computeCost will cause the plan to be discarded, making vectorization impossible. At this stage, if we discover that the target does not support strided memory accesses, it is too late to fall back to using widen load + reverse for vectorizing reverse accesses.
Additionally, on targets where strided memory accesses have worse performance, this would increase the vectorization cost, potentially leading to vectorization being abandoned.
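As an illustration of deciding earlier, a sketch of one possible gate; it assumes the existing TTI::isLegalStridedLoadStore hook and a hypothetical helper name, and is not code from this patch:

#include "llvm/Analysis/TargetTransformInfo.h"

using namespace llvm;

// Decide at recipe-construction time, before plan costing, whether to emit
// the vp.strided.load/store form for a reversed access. A target without
// strided memory ops then keeps the widen-load + reverse lowering instead
// of having the whole plan discarded later with an invalid cost.
static bool useNegativeStrideLowering(const TargetTransformInfo &TTI,
                                      Type *DataTy, Align Alignment) {
  return TTI.isLegalStridedLoadStore(DataTy, Alignment);
}

A cost comparison on top of such a gate could then choose between the two legal lowerings, which is what the x280 numbers further down suggest is needed.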
Sounds good :)
For the current vectorizer framework, I think it is not easy to implement choosing between lowering methods based on cost. We compute the cost for different
No, the current framework can handle this. This decision is made by
Additionally, replacing reverse memory accesses with strided memory accesses using a stride of -1 is beneficial not only when tail folding by EVL is enabled, but also for other tail folding styles. It may even provide benefits for fixed VF. I believe this optimization should not be restricted to EVL vectorization.
I think you could also implement this by optimizing/transforming VPlan recipes and clamping the VF range based on the cost model in an early stage, and then lowering to concrete recipes if needed just before execution.
Ping!
#128718
Is this related to #118638?
Hi @wangpc-pp, this patch is good, but I think we need a cost-model-based decision to choose between a negative stride and reversing the vector. We've tested the performance on x280 for negative-stride load vs. vector load + reverse, and the results show the negative-stride load is 10% slower than the reverse.
Thanks for the information about x280! I will wait until @Mel-Chen has finished her CM_Strided patches and see how far we can get.
Yes, they are currently related, but ultimately the strided memory pointer should not depend on VPWidenIntOrFpInductionRecipe, as strided accesses only require a scalar address, unlike gather/scatter, which require a vectorized address. However, supporting VPWidenIntOrFpInductionRecipe for tail folding with EVL remains crucial. This would enable the vectorization of non-unit-stride strided accesses under tail folding by EVL, which I believe would be a significant improvement. https://godbolt.org/z/YTxdhqcWe
I will start reviewing #118638 next week since this Friday is a holiday at my location. :)
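For reference, a non-unit-stride access of the kind meant here looks like the following illustrative loop (not taken from the linked example):

// The store walks memory with a stride of 3 elements, so vectorizing it
// needs only a scalar base address plus a constant stride (a strided
// store), not a fully general scatter address vector.
void scale_every_third(int *a, const int *b, int n) {
  for (int i = 0; i < n; ++i)
    a[3 * i] = 2 * b[i];
}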
if (CreateGather) {
  NewLI =
      Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                              nullptr, "wide.masked.gather");
} else if (isReverse()) {
I have tested TSVC:
- Using your patch, the s1112 time goes from 24 to 19.
- For reverse, iterative, contiguous load and store accesses, I think they can be handled with unit-stride load/store by updating the base address, as sketched below. I haven't figured out whether there is anything wrong with this; please let me know if there is. These are some changes I made on top of your PR, can you take a look? https://github.com/llvm/llvm-project/pull/130032/files
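If I understand the suggestion, a hedged sketch of what "unit load/store by updating the base address" means for a reversed contiguous loop; the scalar inner loop stands in for one unit-stride vector load/store, and the names and step shape are my assumptions, not the code in the linked PR:

// Scalar loop, both accesses walk backwards over contiguous memory:
//   for (long i = n - 1; i >= 0; --i) dst[i] = src[i];
//
// For a vector step of VF elements ending at index i, a forward unit-stride
// load/store from an adjusted base covers the same VF elements. Because the
// load and the store use the same (forward) lane order, no reverse and no
// negative stride is needed.
void reverse_copy_vector_step(int *dst, const int *src, long i, long vf) {
  const int *LdBase = src + (i - vf + 1); // updated base for this step
  int *StBase = dst + (i - vf + 1);
  for (long l = 0; l < vf; ++l) // one unit-stride vector load + store
    StBase[l] = LdBase[l];
}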