[LV][EVL] Generate negative strided load/store for reversed load/store #123608

Open: wants to merge 11 commits into main
9 changes: 8 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7433,6 +7433,13 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
// comparing against the legacy cost isn't desirable.
if (isa<VPPartialReductionRecipe>(&R))
return true;

// The VPlan-based cost model may calculate the cost of strided load/store
// which can't be modeled in the legacy cost model.
if (isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R))
if (cast<VPWidenMemoryRecipe>(&R)->isReverse())
return true;

if (Instruction *UI = GetInstructionForCost(&R))
SeenInstrs.insert(UI);
}
@@ -8281,7 +8288,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
auto *GEP = dyn_cast<GetElementPtrInst>(
Ptr->getUnderlyingValue()->stripPointerCasts());
VPSingleDefRecipe *VectorPtr;
if (Reverse) {
if (Reverse && !CM.foldTailWithEVL()) {
// When folding the tail, we may compute an address that we don't in the
// original scalar loop and it may not be inbounds. Drop Inbounds in that
// case.
75 changes: 38 additions & 37 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2603,17 +2603,6 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif

/// Use all-true mask for reverse rather than actual mask, as it avoids a
/// dependence w/o affecting the result.
static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
Value *EVL, const Twine &Name) {
VectorType *ValTy = cast<VectorType>(Operand->getType());
Value *AllTrueMask =
Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
{Operand, AllTrueMask, EVL}, nullptr, Name);
}

void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
auto *LI = cast<LoadInst>(&Ingredient);

@@ -2628,18 +2617,26 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
Value *EVL = State.get(getEVL(), VPLane(0));
Value *Addr = State.get(getAddr(), !CreateGather);
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
if (VPValue *VPMask = getMask())
Mask = State.get(VPMask);
if (isReverse())
Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
} else {
else
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
}

if (CreateGather) {
NewLI =
Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
nullptr, "wide.masked.gather");
} else if (isReverse()) {
Contributor

I have tested TSVC:

  1. With your patch, the s1112 time goes from 24 to 19.
  2. For reverse contiguous load and store accesses, I think they can be handled with unit-stride load/store by updating the base address. I haven't figured out whether there is anything wrong with this; please let me know if there is. These are some changes I made on top of your PR, can you take a look? https://github.com/llvm/llvm-project/pull/130032/files

The TSVC test results are as follows:
[TSVC benchmark results image]

auto *EltTy = DataTy->getElementType();
auto *PtrTy = Addr->getType();
Value *Operands[] = {
Addr,
ConstantInt::getSigned(Builder.getInt32Ty(),
-LI->getDataLayout().getTypeAllocSize(EltTy)),
Mask, EVL};
NewLI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
{DataTy, PtrTy, Builder.getInt32Ty()},
Operands, nullptr, "vp.reverse.load");
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
@@ -2650,8 +2647,6 @@
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
State.addMetadata(NewLI, LI);
Instruction *Res = NewLI;
if (isReverse())
Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
State.set(this, Res);
}

@@ -2670,14 +2665,13 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
if (!Reverse)
return Cost;
return Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
AS, Ctx.CostKind);

return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
cast<VectorType>(Ty), {}, Ctx.CostKind,
0);
return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty,
getAddr()->getUnderlyingValue(), false,
Alignment, Ctx.CostKind);
Comment on lines +2672 to +2674
Contributor

What will happen if getStridedMemoryOpCost returns Invalid?

Contributor Author

Then the cost of the VPlan will be invalid and we emit a marker?

Contributor

Emitting Invalid in computeCost will cause the plan to be discarded, making vectorization impossible. At this stage, if we discover that the target does not support strided memory accesses, it is too late to fall back to using widen load + reverse for vectorizing reverse accesses.

Additionally, on targets where strided memory accesses have worse performance, this would increase the vectorization cost, potentially leading to vectorization being abandoned.
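
For reference, here is a rough sketch (not part of this patch, just one possible option) of how computeCost could guard against an Invalid strided cost by falling back to the previous masked-access-plus-reverse-shuffle estimate. As noted above, costing alone would not be sufficient: the decision to emit a strided access is made when the recipe is lowered, so code generation would have to fall back as well.

// Hypothetical fallback in VPWidenLoadEVLRecipe::computeCost (sketch only).
InstructionCost StridedCost = Ctx.TTI.getStridedMemoryOpCost(
    Ingredient.getOpcode(), Ty, getAddr()->getUnderlyingValue(),
    /*VariableMask=*/false, Alignment, Ctx.CostKind);
if (StridedCost.isValid())
  return StridedCost;
// The target cannot cost the strided access; fall back to the old estimate of
// a masked contiguous access plus a reverse shuffle.
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
    Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
                                     cast<VectorType>(Ty), {}, Ctx.CostKind, 0);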

}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2749,21 +2743,29 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
CallInst *NewSI = nullptr;
Value *StoredVal = State.get(StoredValue);
Value *EVL = State.get(getEVL(), VPLane(0));
if (isReverse())
StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
Value *Mask = nullptr;
if (VPValue *VPMask = getMask()) {
if (VPValue *VPMask = getMask())
Mask = State.get(VPMask);
if (isReverse())
Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
} else {
else
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
}

Value *Addr = State.get(getAddr(), !CreateScatter);
if (CreateScatter) {
NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
Intrinsic::vp_scatter,
{StoredVal, Addr, Mask, EVL});
} else if (isReverse()) {
Type *StoredValTy = StoredVal->getType();
auto *EltTy = cast<VectorType>(StoredValTy)->getElementType();
auto *PtrTy = Addr->getType();
Value *Operands[] = {
StoredVal, Addr,
ConstantInt::getSigned(Builder.getInt32Ty(),
-SI->getDataLayout().getTypeAllocSize(EltTy)),
Mask, EVL};
NewSI = Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_store,
{StoredValTy, PtrTy, Builder.getInt32Ty()},
Operands);
} else {
VectorBuilder VBuilder(Builder);
VBuilder.setEVL(EVL).setMask(Mask);
@@ -2791,14 +2793,13 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
unsigned AS =
getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
if (!Reverse)
return Cost;
return Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
AS, Ctx.CostKind);

return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
cast<VectorType>(Ty), {}, Ctx.CostKind,
0);
return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty,
getAddr()->getUnderlyingValue(), false,
Alignment, Ctx.CostKind);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -34,21 +34,11 @@ define void @reverse_load_store(i64 %startval, ptr noalias %ptr, ptr noalias %pt
; IF-EVL-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], -1
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP18:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP9:%.*]] = mul i64 0, [[TMP18]]
; IF-EVL-NEXT: [[TMP10:%.*]] = sub i64 1, [[TMP18]]
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP9]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP10]]
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP12]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
; IF-EVL-NEXT: [[VP_REVERSE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP12]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[PTR2:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP19:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP14:%.*]] = mul i64 0, [[TMP19]]
; IF-EVL-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP19]]
; IF-EVL-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP13]], i64 [[TMP14]]
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP22]], i64 [[TMP15]]
; IF-EVL-NEXT: [[VP_REVERSE3:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE3]], ptr align 4 [[TMP17]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0
; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[VP_REVERSE_LOAD]], ptr align 4 [[TMP17]], i32 -4, <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP20:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP20]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -148,23 +138,11 @@ define void @reverse_load_store_masked(i64 %startval, ptr noalias %ptr, ptr noal
; IF-EVL-NEXT: [[TMP14:%.*]] = icmp slt <vscale x 4 x i32> [[VP_OP_LOAD]], splat (i32 100)
; IF-EVL-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
; IF-EVL-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[PTR1:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP26:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP26]]
; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP26]]
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[TMP16]], i64 [[TMP17]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i64 [[TMP18]]
; IF-EVL-NEXT: [[VP_REVERSE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_OP_LOAD4:%.*]] = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP20]], <vscale x 4 x i1> [[VP_REVERSE_MASK]], i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_OP_LOAD4]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP16]], i32 0
; IF-EVL-NEXT: [[VP_REVERSE_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.strided.load.nxv4i32.p0.i32(ptr align 4 [[TMP20]], i32 -4, <vscale x 4 x i1> [[TMP15]], i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i32, ptr [[PTR2:%.*]], i64 [[TMP11]]
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[TMP22:%.*]] = mul i64 0, [[TMP27]]
; IF-EVL-NEXT: [[TMP23:%.*]] = sub i64 1, [[TMP27]]
; IF-EVL-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[TMP21]], i64 [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i64 [[TMP23]]
; IF-EVL-NEXT: [[VP_REVERSE5:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[VP_REVERSE]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: [[VP_REVERSE_MASK6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> splat (i1 true), i32 [[TMP5]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv4i32.p0(<vscale x 4 x i32> [[VP_REVERSE5]], ptr align 4 [[TMP25]], <vscale x 4 x i1> [[VP_REVERSE_MASK6]], i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP21]], i32 0
; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv4i32.p0.i32(<vscale x 4 x i32> [[VP_REVERSE_LOAD]], ptr align 4 [[TMP25]], i32 -4, <vscale x 4 x i1> [[TMP15]], i32 [[TMP5]])
; IF-EVL-NEXT: [[TMP28:%.*]] = zext i32 [[TMP5]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP28]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -258,48 +236,33 @@ define void @multiple_reverse_vector_pointer(ptr noalias %a, ptr noalias %b, ptr
; IF-EVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; IF-EVL: vector.ph:
; IF-EVL-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
; IF-EVL-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
; IF-EVL-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1
; IF-EVL-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP2]]
; IF-EVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; IF-EVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; IF-EVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16
; IF-EVL-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8
; IF-EVL-NEXT: [[TMP5:%.*]] = sub i64 1024, [[N_VEC]]
; IF-EVL-NEXT: br label [[VECTOR_BODY:%.*]]
; IF-EVL: vector.body:
; IF-EVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ]
; IF-EVL-NEXT: [[AVL:%.*]] = sub i64 1025, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 16, i1 true)
; IF-EVL-NEXT: [[TMP6:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 8, i1 true)
; IF-EVL-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1024, [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0
; IF-EVL-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP9:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP10:%.*]] = mul i64 0, [[TMP9]]
; IF-EVL-NEXT: [[TMP11:%.*]] = sub i64 1, [[TMP9]]
; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]]
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP11]]
; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP13]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[VP_OP_LOAD]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 16 x i8> [[VP_REVERSE]]
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 16 x i8> @llvm.vp.gather.nxv16i8.nxv16p0(<vscale x 16 x ptr> align 1 [[TMP14]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP8]], i32 0
; IF-EVL-NEXT: [[VP_REVERSE_LOAD:%.*]] = call <vscale x 8 x i8> @llvm.experimental.vp.strided.load.nxv8i8.p0.i32(ptr align 1 [[TMP13]], i32 -1, <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B:%.*]], <vscale x 8 x i8> [[VP_REVERSE_LOAD]]
; IF-EVL-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 8 x i8> @llvm.vp.gather.nxv8i8.nxv8p0(<vscale x 8 x ptr> align 1 [[TMP14]], <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP16:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP16]]
; IF-EVL-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP16]]
; IF-EVL-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[TMP15]], i64 [[TMP17]]
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i64 [[TMP18]]
; IF-EVL-NEXT: [[VP_REVERSE1:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE1]], ptr align 1 [[TMP20]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP15]], i32 0
; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv8i8.p0.i32(<vscale x 8 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP20]], i32 -1, <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[D:%.*]], i64 [[TMP7]]
; IF-EVL-NEXT: [[TMP22:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[TMP23:%.*]] = mul i64 0, [[TMP22]]
; IF-EVL-NEXT: [[TMP24:%.*]] = sub i64 1, [[TMP22]]
; IF-EVL-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i64 [[TMP23]]
; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i64 [[TMP24]]
; IF-EVL-NEXT: [[VP_REVERSE2:%.*]] = call <vscale x 16 x i8> @llvm.experimental.vp.reverse.nxv16i8(<vscale x 16 x i8> [[WIDE_MASKED_GATHER]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: call void @llvm.vp.store.nxv16i8.p0(<vscale x 16 x i8> [[VP_REVERSE2]], ptr align 1 [[TMP26]], <vscale x 16 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP21]], i32 0
; IF-EVL-NEXT: call void @llvm.experimental.vp.strided.store.nxv8i8.p0.i32(<vscale x 8 x i8> [[WIDE_MASKED_GATHER]], ptr align 1 [[TMP26]], i32 -1, <vscale x 8 x i1> splat (i1 true), i32 [[TMP6]])
; IF-EVL-NEXT: [[TMP27:%.*]] = zext i32 [[TMP6]] to i64
; IF-EVL-NEXT: [[INDEX_EVL_NEXT]] = add nuw i64 [[TMP27]], [[EVL_BASED_IV]]
; IF-EVL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
@@ -38,13 +38,8 @@ define void @lshift_significand(i32 %n, ptr nocapture writeonly %dst) {
; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP13:%.*]] = sub nuw nsw i64 1, [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, ptr [[DST]], i64 [[TMP13]]
; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[TMP17:%.*]] = mul i64 0, [[TMP15]]
; CHECK-NEXT: [[TMP18:%.*]] = sub i64 1, [[TMP15]]
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[TMP14]], i64 [[TMP17]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP19]], i64 [[TMP18]]
; CHECK-NEXT: [[VP_REVERSE:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vp.reverse.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> [[VP_REVERSE]], ptr align 8 [[TMP20]], <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP14]], i32 0
; CHECK-NEXT: call void @llvm.experimental.vp.strided.store.nxv2i64.p0.i32(<vscale x 2 x i64> zeroinitializer, ptr align 8 [[TMP20]], i32 -8, <vscale x 2 x i1> splat (i1 true), i32 [[TMP11]])
; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP11]] to i64
; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]]