
[LV][EVL] Generate negative strided load/store for reversed load/store #123608


Open
wangpc-pp wants to merge 11 commits into main

Conversation

wangpc-pp
Contributor

This removes the need to reverse the mask, the loaded result,
and the stored value, reducing the number of operations.

@wangpc-pp wangpc-pp requested review from fhahn, ayalz and arcbbb and removed request for ayalz January 20, 2025 13:01
@wangpc-pp wangpc-pp requested a review from ayalz January 20, 2025 13:01
@llvmbot
Member

llvmbot commented Jan 20, 2025

@llvm/pr-subscribers-backend-risc-v
@llvm/pr-subscribers-llvm-analysis
@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-llvm-transforms

Author: Pengcheng Wang (wangpc-pp)

Changes

This removes the need to reverse the mask, the loaded result,
and the stored value, reducing the number of operations.


Full diff: https://github.com/llvm/llvm-project/pull/123608.diff

3 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+38-28)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll (+7-16)
  • (modified) llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll (+1-2)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index aa5f92b235555e..587c7e9b4417fa 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2603,17 +2603,6 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-/// Use all-true mask for reverse rather than actual mask, as it avoids a
-/// dependence w/o affecting the result.
-static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
-                                     Value *EVL, const Twine &Name) {
-  VectorType *ValTy = cast<VectorType>(Operand->getType());
-  Value *AllTrueMask =
-      Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
-  return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
-                                 {Operand, AllTrueMask, EVL}, nullptr, Name);
-}
-
 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   auto *LI = cast<LoadInst>(&Ingredient);
 
@@ -2630,8 +2619,6 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   Value *Mask = nullptr;
   if (VPValue *VPMask = getMask()) {
     Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
   } else {
     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
   }
@@ -2641,17 +2628,29 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                                 nullptr, "wide.masked.gather");
   } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Load, DataTy, Addr, "vp.op.load"));
+    if (isReverse()) {
+      auto *EltTy = DataTy->getElementType();
+      auto *PtrTy = Addr->getType();
+      Value *Operands[] = {
+          Addr,
+          ConstantInt::getSigned(
+              Builder.getInt32Ty(),
+              -static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8),
+          Mask, EVL};
+      NewLI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
+                                      {DataTy, PtrTy, Builder.getInt32Ty()},
+                                      Operands, nullptr, "vp.neg.strided.load");
+    } else {
+      VectorBuilder VBuilder(Builder);
+      VBuilder.setEVL(EVL).setMask(Mask);
+      NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
+          Instruction::Load, DataTy, Addr, "vp.op.load"));
+    }
   }
   NewLI->addParamAttr(
       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
   State.addMetadata(NewLI, LI);
   Instruction *Res = NewLI;
-  if (isReverse())
-    Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
   State.set(this, Res);
 }
 
@@ -2749,13 +2748,9 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   CallInst *NewSI = nullptr;
   Value *StoredVal = State.get(StoredValue);
   Value *EVL = State.get(getEVL(), VPLane(0));
-  if (isReverse())
-    StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
   Value *Mask = nullptr;
   if (VPValue *VPMask = getMask()) {
     Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
   } else {
     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
   }
@@ -2765,11 +2760,26 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
                                     Intrinsic::vp_scatter,
                                     {StoredVal, Addr, Mask, EVL});
   } else {
-    VectorBuilder VBuilder(Builder);
-    VBuilder.setEVL(EVL).setMask(Mask);
-    NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
-        Instruction::Store, Type::getVoidTy(EVL->getContext()),
-        {StoredVal, Addr}));
+    if (isReverse()) {
+      Type *StoredValTy = StoredVal->getType();
+      auto *EltTy = cast<VectorType>(StoredValTy)->getElementType();
+      auto *PtrTy = Addr->getType();
+      Value *Operands[] = {
+          StoredVal, Addr,
+          ConstantInt::getSigned(
+              Builder.getInt32Ty(),
+              -static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8),
+          Mask, EVL};
+      NewSI = Builder.CreateIntrinsic(
+          Intrinsic::experimental_vp_strided_store,
+          {StoredValTy, PtrTy, Builder.getInt32Ty()}, Operands);
+    } else {
+      VectorBuilder VBuilder(Builder);
+      VBuilder.setEVL(EVL).setMask(Mask);
+      NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
+          Instruction::Store, Type::getVoidTy(EVL->getContext()),
+          {StoredVal, Addr}));
+    }
   }
   NewSI->addParamAttr(
       1, Attribute::getWithAlignment(NewSI->getContext(), Alignment));
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
index 5b579b0749c677..ba65137e94935c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-reverse-load-store.ll
@@ -39,16 +39,14 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
 ; IF-EVL-NEXT:    [[TMP10:%.*]] = sub i64 1, [[TMP18]]
 ; IF-EVL-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
-; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT:    [[VP_NEG_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP12]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = mul i64 0, [[TMP19]]
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = sub i64 1, [[TMP19]]
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
 ; IF-EVL-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
-; IF-EVL-NEXT:    [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[VP_NEG_STRIDED_LOAD]], ptr align 4 [[TMP17]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -153,18 +151,14 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP26]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT:    [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
+; IF-EVL-NEXT:    [[VP_NEG_STRIDED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP20]], i32 -4, <vscale x 4 x i1> [[TMP15]], i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
 ; IF-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = mul i64 0, [[TMP27]]
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = sub i64 1, [[TMP27]]
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
-; IF-EVL-NEXT:    [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
+; IF-EVL-NEXT:    call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[VP_NEG_STRIDED_LOAD]], ptr align 4 [[TMP25]], i32 -4, <vscale x 4 x i1> [[TMP15]], i32 [[TMP5]])
 ; IF-EVL-NEXT:    [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -280,8 +274,7 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
 ; IF-EVL-NEXT:    [[TMP11:%.*]] = sub i64 1, [[TMP9]]
 ; IF-EVL-NEXT:    [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]]
 ; IF-EVL-NEXT:    [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]]
-; IF-EVL-NEXT:    [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.strided.load.nxv16i8.p0.i32(ptr align 1 [[TMP13]], i32 -1, <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_REVERSE]]
 ; IF-EVL-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP7]]
@@ -290,16 +283,14 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
 ; IF-EVL-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP16]]
 ; IF-EVL-NEXT:    [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
 ; IF-EVL-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
-; IF-EVL-NEXT:    [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i32(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP20]], i32 -1, <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP7]]
 ; IF-EVL-NEXT:    [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
 ; IF-EVL-NEXT:    [[TMP23:%.*]] = mul i64 0, [[TMP22]]
 ; IF-EVL-NEXT:    [[TMP24:%.*]] = sub i64 1, [[TMP22]]
 ; IF-EVL-NEXT:    [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
 ; IF-EVL-NEXT:    [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
-; IF-EVL-NEXT:    [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
-; IF-EVL-NEXT:    call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE2]], ptr align 1 [[TMP26]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
+; IF-EVL-NEXT:    call void @llvm.experimental.vp.strided.store.nxv16i8.p0.i32(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP26]], i32 -1, <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
 ; IF-EVL-NEXT:    [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
 ; IF-EVL-NEXT:    [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]]
 ; IF-EVL-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
index a2f85b9ed4ffe1..69ba0bad45de6a 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-uniform-store.ll
@@ -43,8 +43,7 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
 ; CHECK-NEXT:    [[TMP18:%.*]] = sub i64 1, [[TMP15]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i64, ptr [[TMP14]], i64 [[TMP17]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i64, ptr [[TMP19]], i64 [[TMP18]]
-; CHECK-NEXT:    [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
-; CHECK-NEXT:    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP20]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
+; CHECK-NEXT:    call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i32(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[TMP20]], i32 -8, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP11]] to i64
 ; CHECK-NEXT:    [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]

@alexey-bataev
Member

Is it correctly represented in the cost model?

Contributor

@lukel97 lukel97 left a comment

Thanks, this is much preferable to having to lower experimental_vp_reverse in the RISC-V backend.

It looks like this also removes the only use of llvm.experimental.vp.reverse. Do we think we will need it for anything else in the EVL line of work? If not could we eventually end up removing it?

@alexey-bataev
Member

There is another potential issue. Some targets might not support strided operations, so it would be good to add a check, and possibly a new flag, in the load/store recipes for strided ops.

@lukel97
Contributor

lukel97 commented Jan 20, 2025

There is another potential issue. Some targets might not support strided operations, so it would be good to add a check, and possibly a new flag, in the load/store recipes for strided ops.

I don't think I'm too concerned about other targets since as far as I'm aware RISC-V is the only target that can lower llvm.experimental.vp.reverse today. And we don't seem to currently check if it's supported anyway?

@alexey-bataev
Member

There is another potential issue. Some targets might not support strided operations, so it would be good to add a check, and possibly a new flag, in the load/store recipes for strided ops.

I don't think I'm too concerned about other targets since as far as I'm aware RISC-V is the only target that can lower llvm.experimental.vp.reverse today.

Today, maybe. But the EVL vectorizer is not RISC-V-only (generally speaking!), so we need to provide full correctness here for all potential targets.

And we don't seem to currently check if it's supported anyway?

Yes, because we never emit strided ops in LV for now. To emit them, we need to correctly implement the legality checks and the cost model.

@lukel97
Contributor

lukel97 commented Jan 20, 2025

And we don't seem to currently check if it's supported anyway?

Yes, because we never emit strided ops in LV for now. To emit them, we need to correctly implement the legality checks and the cost model.

I meant as in we don't check if llvm.experimental.vp.reverse is supported, and it doesn't look like there's a default expansion for it

@alexey-bataev
Member

And we don't seem to currently check if it's supported anyway?

Yes, because we never emit strided ops in LV for now. To emit them, we need to correctly implement the legality checks and the cost model.

I meant as in we don't check if llvm.experimental.vp.reverse is supported, and it doesn't look like there's a default expansion for it

It is not correct and should be checked

@wangpc-pp
Contributor Author

And we don't seem to currently check if it's supported anyway?

Yes, because we never emit strided ops in LV for now. To emit them, we need to correctly implement the legality checks and the cost model.

I meant as in we don't check if llvm.experimental.vp.reverse is supported, and it doesn't look like there's a default expansion for it

It is not correct and should be checked

I am not so familiar with the current LV infrastructure; how should I check the legality and cost model here?

@wangpc-pp wangpc-pp force-pushed the main-vectorizer-evl-reverse-load-store-to-strided branch from 3a6a226 to f4f50e6 on January 21, 2025 03:58
@lukel97
Contributor

lukel97 commented Jan 21, 2025

I am not so familiar with the current LV infrastructure; how should I check the legality and cost model here?

In VPWidenLoadEVLRecipe::computeCost you can remove the getShuffleCost bit and use TTI.getStridedMemoryOpCost.

If it's not supported then the cost should come back invalid, which should prevent the plan from being selected, I think.
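
For reference, a minimal sketch of what that computeCost change could look like. This is not the exact code from the PR; the helper names (toVectorTy, getLoadStoreType, getLoadStoreAlignment) and the surrounding variables are assumptions, and the snippet quoted further down in this review shows the form the patch actually uses.

    // Rough sketch inside VPWidenLoadEVLRecipe::computeCost for the
    // isReverse() case: cost the access as a single strided memory op
    // instead of a consecutive load plus a reverse shuffle.
    Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
    const Align Alignment = getLoadStoreAlignment(&Ingredient);
    if (isReverse())
      return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty,
                                            getAddr()->getUnderlyingValue(),
                                            /*VariableMask=*/false, Alignment,
                                            Ctx.CostKind);
    // If the target has no strided form, getStridedMemoryOpCost returns an
    // invalid InstructionCost, which makes the whole VPlan's cost invalid
    // and keeps that plan from being selected.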

@wangpc-pp
Contributor Author

I am not so familiar with the current LV infrastructure; how should I check the legality and cost model here?

In VPWidenLoadEVLRecipe::computeCost you can remove the getShuffleCost bit and use TTI.getStridedMemoryOpCost.

If it's not supported then the cost should come back invalid, which should prevent the plan from being selected, I think.

Thanks! I think I have followed your suggestion and ported the new cost model. But it seems that the legacy cost model and the VPlan-based cost model make different decisions about the best factor. I debugged it and I don't think there is anything wrong.

Contributor

@lukel97 lukel97 left a comment

LGTM. Probably best to wait for @fhahn or @alexey-bataev to review it too

@wangpc-pp wangpc-pp mentioned this pull request Jan 17, 2025
Contributor

@Mel-Chen Mel-Chen left a comment

I am wondering about the necessity of emitting vp.strided.load/store in the vectorizer stage.
Can we convert load + reverse / reverse + store to strided load/store in CodeGenPrepare? That would be similar in functionality to the RISC-V gather/scatter lowering and the Interleaved Access pass.
https://godbolt.org/z/rc3zPqrez

@lukel97
Contributor

lukel97 commented Jan 21, 2025

I am wondering about the necessity of emitting vp.strided.load/store in the vectorizer stage.

One benefit of emitting strided intrinsics earlier is that it allows us to use the strided memory op TTI hook in the cost model.

Otherwise we would have to conservatively cost it as a wide load + reverse, and IIRC the reverse is still costed as a quadratic vrgather today.

@wangpc-pp
Contributor Author

Are there more comments? I hope I can land this before my Chinese New Year holidays. :-)

Contributor

@fhahn fhahn left a comment

LGTM, thanks

@Mel-Chen
Contributor

Mel-Chen commented Feb 4, 2025

I am wondering about the necessity of emitting vp.strided.load/store in the vectorizer stage.

One benefit of emitting strided intrinsics earlier is that it allows us to use the strided memory op TTI hook in the cost model.

Otherwise we would have to conservatively cost it as a wide load + reverse, and IIRC the reverse is still costed as a quadratic vrgather today.

A more precise cost model sounds like a good reason.
But if that's the case, wouldn't it be sufficient to just modify computeCost, keep load + reverse, and convert it into strided accesses in CodeGenPrepare? This would also benefit hand-written LLVM IR.

Another option is the InstWidening enum, which is usually where decisions on how to widen memory accesses are made. It might be helpful here.

@lukel97
Contributor

lukel97 commented Feb 4, 2025

But if that's the case, wouldn't it be sufficient to just modify computeCost, keep load + reverse, and convert it into strided accesses in CodeGenPrepare?

I guess it depends on what we want the canonical form for a reversed load to be, i.e. a vp intrinsic or load + reverse. My preference would be the former since it's easier to pattern match.

I didn't realise this would be unprofitable on x280, though; in that case I think it's reasonable to check the cost of getStridedMemoryOpCost vs. getMemoryOpCost + getShuffleCost and choose the cheaper lowering option, as @arcbbb suggested earlier. That way the cost for the recipe would just be the minimum of the two.
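
A rough sketch of that cost comparison, assuming both lowerings remain available: the TTI hooks named here exist, but the exact signatures, the surrounding variables (Ty, Alignment, Ctx), and the use of SK_Reverse for the shuffle are assumptions rather than code from this PR.

    // Cost the reversed access both ways and keep the cheaper one:
    //   (1) a single strided access with a negative stride
    //   (2) a consecutive access followed by a vector reverse shuffle
    InstructionCost StridedCost = Ctx.TTI.getStridedMemoryOpCost(
        Ingredient.getOpcode(), Ty, getAddr()->getUnderlyingValue(),
        /*VariableMask=*/false, Alignment, Ctx.CostKind);
    InstructionCost LoadReverseCost =
        Ctx.TTI.getMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
                                /*AddressSpace=*/0, Ctx.CostKind) +
        Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                               cast<VectorType>(Ty), {}, Ctx.CostKind);
    // An invalid strided cost (no strided support on the target) should
    // lose this comparison, so the recipe would stay vectorizable.
    return std::min(StridedCost, LoadReverseCost);

If the minimum is taken here, the execute step would also need to know which lowering won for the chosen VF, which is the gap discussed in the follow-up comments below.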

Comment on lines +2672 to +2674
  return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty,
                                        getAddr()->getUnderlyingValue(), false,
                                        Alignment, Ctx.CostKind);
Contributor

What will happen if getStridedMemoryOpCost returns Invalid?

Contributor Author

Then the cost of the VPlan will be invalid and we emit a marker?

Contributor

Emitting Invalid in computeCost will cause the plan to be discarded, making vectorization impossible. At this stage, if we discover that the target does not support strided memory accesses, it is too late to fall back to a wide load + reverse for vectorizing reverse accesses.

Additionally, on targets where strided memory accesses have worse performance, this would increase the vectorization cost, potentially leading to vectorization being abandoned.

@Mel-Chen
Contributor

Mel-Chen commented Feb 5, 2025

But if that's the case, wouldn't it be sufficient to just modify computeCost, keep load + reverse, and convert it into strided accesses in CodeGenPrepare?

I guess it depends on what we want the canonical form for a reversed load to be, i.e. a vp intrinsic or load + reverse. My preference would be the former since it's easier to pattern match.

I didn't realise this would be unprofitable on x280, though; in that case I think it's reasonable to check the cost of getStridedMemoryOpCost vs. getMemoryOpCost + getShuffleCost and choose the cheaper lowering option, as @arcbbb suggested earlier. That way the cost for the recipe would just be the minimum of the two.

Sounds good :)

@wangpc-pp
Contributor Author

But if that's the case, wouldn't it be sufficient to just modify computeCost, keep load + reverse, and convert it into strided accesses in CodeGenPrepare?

I guess it depends on what we want the canonical form for a reversed load to be, i.e. a vp intrinsic or load + reverse. My preference would be the former since it's easier to pattern match.
I didn't realise this would be unprofitable on x280, though; in that case I think it's reasonable to check the cost of getStridedMemoryOpCost vs. getMemoryOpCost + getShuffleCost and choose the cheaper lowering option, as @arcbbb suggested earlier. That way the cost for the recipe would just be the minimum of the two.

Sounds good :)

For the current vectorizer framework, I think it is not easy to implement choosing lowering methods based on cost. We compute the cost for different VFs and CostKinds, but when executing the VPlan, the VF has already been chosen. There is a gap between cost modeling and VPlan execution. We could add a map (VF/CostKind -> lowering method) to the recipe and query the decision for the execution VF. I'd like to do that in a future follow-up. :-)
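
A purely hypothetical sketch of that idea, only to illustrate recording a per-VF decision at cost-modeling time and querying it at execution time; none of these names exist in the codebase.

    // Hypothetical helper that a memory recipe could carry: remember, per
    // candidate VF, whether the reversed access should be lowered as a
    // negative-strided op or as a load/store plus reverse.
    enum class ReverseLowering { StridedAccess, LoadStoreThenReverse };

    struct ReverseLoweringDecision {
      SmallVector<std::pair<ElementCount, ReverseLowering>, 4> PerVF;

      void record(ElementCount VF, ReverseLowering L) {
        PerVF.emplace_back(VF, L);
      }
      // Default to load/store + reverse if no decision was recorded.
      ReverseLowering get(ElementCount VF) const {
        for (const auto &[DecidedVF, L] : PerVF)
          if (DecidedVF == VF)
            return L;
        return ReverseLowering::LoadStoreThenReverse;
      }
    };

computeCost would call record() for each VF it evaluates, and execute() would call get() with the VF that was ultimately chosen.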

@Mel-Chen
Contributor

Mel-Chen commented Feb 7, 2025

For the current vectorizer framework, I think it is not easy to implement choosing lowering methods based on cost.

No, the current framework can handle this. This decision is made by LoopVectorizationCostModel::setCostBasedWideningDecision based on cost.

Additionally, replacing reverse memory accesses with strided memory accesses using a stride of -1 is beneficial not only when tail folding by EVL is enabled but also for other tail folding styles. It may even provide benefits for fixed VF. I believe this optimization should not be restricted to EVL vectorization.

@ElvisWang123
Contributor

I think you could also implement this by optimizing/transforming the VPlan recipes and clamping the VF range based on the cost model at an early stage, then lowering to concrete recipes if needed just before execution.
This is similar to #113903.

@wangpc-pp
Contributor Author

Ping!

@Mel-Chen
Contributor

#128718
I use InstWidening before VPlan generation to decide between reverse and strided accesses based on cost. This benefits vectorization regardless of whether tail folding with EVL is enabled.
Currently, due to the existing dependency on VPWidenIntOrFpInductionRecipe, the VPlan for tail folding with EVL is bailed out. I will work on eliminating this dependency to resolve the issue this week.

@lukel97
Contributor

lukel97 commented Feb 26, 2025

Currently, due to the existing dependency on VPWidenIntOrFpInductionRecipe, the VPlan for tail folding with EVL is bailed out. I will work on eliminating this dependency to resolve the issue this week.

Is this related to #118638?

@ElvisWang123
Contributor

Hi @wangpc-pp,

This patch is good, but I think we need a cost-model-based decision between using a negative stride and a reversed vector.

We've tested the performance on x280 for negative-stride load vs. vector load + reverse, and the results show the negative-stride load is 10% slower than the reverse.

@wangpc-pp
Contributor Author

Hi @wangpc-pp,

This patch is good, but I think we need a cost-model-based decision between using a negative stride and a reversed vector.

We've tested the performance on x280 for negative-stride load vs. vector load + reverse, and the results show the negative-stride load is 10% slower than the reverse.

Thanks for the information about x280! I will wait until @Mel-Chen has finished her CM_Strided patches and see where we end up.

@Mel-Chen
Contributor

Mel-Chen commented Feb 27, 2025

Currently, due to the existing dependency on VPWidenIntOrFpInductionRecipe, the VPlan for tail folding with EVL is bailed out. I will work on eliminating this dependency to resolve the issue this week.

Is this related to #118638?

Yes, they are currently related, but ultimately the strided memory pointer should not depend on VPWidenIntOrFpInductionRecipe, as strided accesses only require a scalar address, unlike gather/scatter, which require a vector of addresses.

However, supporting VPWidenIntOrFpInductionRecipe for tail folding with EVL remains crucial. It would enable the vectorization of non-unit-stride accesses under tail folding by EVL, which I believe would be a significant improvement. https://godbolt.org/z/YTxdhqcWe

I will start reviewing #118638 next week, since this Friday is a holiday at my location. :)


  if (CreateGather) {
    NewLI =
        Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                                nullptr, "wide.masked.gather");
  } else if (isReverse()) {
Contributor

I have tested TSVC:

  1. With your patch, the time for s1112 goes from 24 to 19.
  2. For reverse, contiguous load and store accesses, I think they can be handled with unit-stride load/store by updating the base address. I haven't figured out whether there is anything wrong with this; please let me know if there is. These are some changes I made on top of your PR, can you take a look? https://github.com/llvm/llvm-project/pull/130032/files

The TSVC test results are as follows:
[TSVC results image]
