Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[VPlan] Use pointer to member 0 as VPInterleaveRecipe's pointer arg. #106431

Merged
merged 12 commits into from
Oct 6, 2024
Merged
10 changes: 8 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,15 @@ class VPBuilder {
new VPInstruction(Instruction::ICmp, Pred, A, B, DL, Name));
}

VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL,
VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
const Twine &Name = "") {
return createInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, DL, Name);
return tryInsertInstruction(new VPInstruction(
Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(false), DL, Name));
}
VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {},
const Twine &Name = "") {
return tryInsertInstruction(new VPInstruction(
Ptr, Offset, VPRecipeWithIRFlags::GEPFlagsTy(true), DL, Name));
}

VPDerivedIVRecipe *createDerivedIV(InductionDescriptor::InductionKind Kind,
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9007,8 +9007,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
VPlanTransforms::createInterleaveGroups(InterleaveGroups, RecipeBuilder,
CM.isScalarEpilogueAllowed());
VPlanTransforms::createInterleaveGroups(
*Plan, InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed());

for (ElementCount VF : Range)
Plan->addVF(VF);
Expand Down
8 changes: 7 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -972,7 +972,6 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
DisjointFlagsTy(bool IsDisjoint) : IsDisjoint(IsDisjoint) {}
};

protected:
struct GEPFlagsTy {
char IsInBounds : 1;
GEPFlagsTy(bool IsInBounds) : IsInBounds(IsInBounds) {}
Expand Down Expand Up @@ -1323,6 +1322,13 @@ class VPInstruction : public VPRecipeWithIRFlags,
assert(Opcode == Instruction::Or && "only OR opcodes can be disjoint");
}

VPInstruction(VPValue *Ptr, VPValue *Offset, GEPFlagsTy Flags = {false},
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
VPInstruction(VPValue *Ptr, VPValue *Offset, GEPFlagsTy Flags = {false},
VPInstruction(VPValue *Ptr, VPValue *Offset, GEPFlagsTy Flags,

default unneeded and may lead to potential ambiguation?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, thanks!

DebugLoc DL = {}, const Twine &Name = "")
: VPRecipeWithIRFlags(VPDef::VPInstructionSC,
ArrayRef<VPValue *>({Ptr, Offset}),
artagnon marked this conversation as resolved.
Show resolved Hide resolved
GEPFlagsTy(Flags), DL),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
GEPFlagsTy(Flags), DL),
Flags, DL),

?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, thanks!

Opcode(VPInstruction::PtrAdd), Name(Name.str()) {}

VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = "");

Expand Down
37 changes: 12 additions & 25 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -653,7 +653,8 @@ Value *VPInstruction::generate(VPTransformState &State) {
"can only generate first lane for PtrAdd");
Value *Ptr = State.get(getOperand(0), /* IsScalar */ true);
Value *Addend = State.get(getOperand(1), /* IsScalar */ true);
return Builder.CreatePtrAdd(Ptr, Addend, Name);
return isInBounds() ? Builder.CreateInBoundsPtrAdd(Ptr, Addend, Name)
: Builder.CreatePtrAdd(Ptr, Addend, Name);
}
case VPInstruction::ResumePhi: {
Value *IncomingFromVPlanPred =
Expand Down Expand Up @@ -2478,51 +2479,37 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
unsigned InterleaveFactor = Group->getFactor();
auto *VecTy = VectorType::get(ScalarTy, State.VF * InterleaveFactor);

// Prepare for the new pointers.
unsigned Index = Group->getIndex(Instr);

// TODO: extend the masked interleaved-group support to reversed access.
VPValue *BlockInMask = getMask();
assert((!BlockInMask || !Group->isReverse()) &&
"Reversed masked interleave-group not supported.");

Value *Idx;
Value *Index;
// If the group is reverse, adjust the index to refer to the last vector lane
// instead of the first. We adjust the index from the first vector lane,
// rather than directly getting the pointer for lane VF - 1, because the
// pointer operand of the interleaved access is supposed to be uniform.
if (Group->isReverse()) {
Value *RuntimeVF =
getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF);
Idx = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
Idx = State.Builder.CreateMul(Idx,
State.Builder.getInt32(Group->getFactor()));
Idx = State.Builder.CreateAdd(Idx, State.Builder.getInt32(Index));
Idx = State.Builder.CreateNeg(Idx);
} else
Idx = State.Builder.getInt32(-Index);
Index = State.Builder.CreateSub(RuntimeVF, State.Builder.getInt32(1));
Index = State.Builder.CreateMul(Index,
State.Builder.getInt32(Group->getFactor()));
Index = State.Builder.CreateNeg(Index);
} else {
// TODO: Drop redundant 0-index GEP as follow-up.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, better keep Idx null if not needed, to refrain from generating a redundant gep with zero.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could pull in here at the cost of a number of additional test changes or land separately.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your call; generating gep with zero also causes some test discrepancies.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I'll leave it as is for now, then drop separately.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dropped in 3ec6f80

Index = State.Builder.getInt32(0);
}

VPValue *Addr = getAddr();
Value *ResAddr = State.get(Addr, VPIteration(0, 0));
if (auto *I = dyn_cast<Instruction>(ResAddr))
State.setDebugLocFrom(I->getDebugLoc());

// Notice current instruction could be any index. Need to adjust the address
// to the member of index 0.
//
// E.g. a = A[i+1]; // Member of index 1 (Current instruction)
// b = A[i]; // Member of index 0
// Current pointer is pointed to A[i+1], adjust it to A[i].
//
// E.g. A[i+1] = a; // Member of index 1
// A[i] = b; // Member of index 0
// A[i+2] = c; // Member of index 2 (Current instruction)
// Current pointer is pointed to A[i+2], adjust it to A[i].

bool InBounds = false;
if (auto *gep = dyn_cast<GetElementPtrInst>(ResAddr->stripPointerCasts()))
InBounds = gep->isInBounds();
ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Idx, "", InBounds);
ResAddr = State.Builder.CreateGEP(ScalarTy, ResAddr, Index, "", InBounds);

State.setDebugLocFrom(Instr->getDebugLoc());
Value *PoisonVec = PoisonValue::get(VecTy);
Expand Down
46 changes: 40 additions & 6 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1584,14 +1584,19 @@ void VPlanTransforms::dropPoisonGeneratingRecipes(
}

void VPlanTransforms::createInterleaveGroups(
const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
VPlan &Plan,
const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
&InterleaveGroups,
VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed) {
if (InterleaveGroups.empty())
return;

// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
// single VPInterleaveRecipe at its insertion point.
VPDominatorTree VPDT;
VPDT.recalculate(Plan);
for (const auto *IG : InterleaveGroups) {
auto *Recipe =
cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getInsertPos()));
SmallVector<VPValue *, 4> StoredValues;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
Expand All @@ -1601,9 +1606,38 @@ void VPlanTransforms::createInterleaveGroups(

bool NeedsMaskForGaps =
IG->requiresScalarEpilogue() && !ScalarEpilogueAllowed;
auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
Recipe->getMask(), NeedsMaskForGaps);
VPIG->insertBefore(Recipe);

Instruction *IRInsertPos = IG->getInsertPos();
auto *InsertPos =
cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IRInsertPos));

// Get or create the start address for the interleave group.
auto *Start =
cast<VPWidenMemoryRecipe>(RecipeBuilder.getRecipe(IG->getMember(0)));
VPValue *Addr = Start->getAddr();
if (!VPDT.properlyDominates(Addr->getDefiningRecipe(), InsertPos)) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

then hoist Addr's defining recipe to insert pos, possibly hoisting additional defining recipes as needed? And/or sink loads above member zero to join it.

In general, all members of an interleave group conceptually move to its insert pos, so may as well perform actual movement. This should facilitate subsequent SLP'ing.

Interleave stores are inserted at the last member, so the address of any member should be available there. Interleaved loads are inserted at the first member, so only its address is guaranteed to be available there, although others may as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good, should be done as potential follow-up?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, worth leaving behind a TODO.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added, thanks!

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also handle(s) a live-in Addr?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done by checking if there's a defining recipe, thanks!

bool InBounds = false;
if (auto *Gep = dyn_cast<GetElementPtrInst>(
getLoadStorePointerOperand(IRInsertPos)->stripPointerCasts()))
InBounds = Gep->isInBounds();

// We cannot re-use the address of the first member because it does not
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// We cannot re-use the address of the first member because it does not
// We cannot re-use the address of member zero because it does not

(may be confusing: insertion point appears "first" (for loads, last for stores); here we mean "first" in terms of memory addresses.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated, thanks!

// dominate the insert position. Use the address of the insert position
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// dominate the insert position. Use the address of the insert position
// dominate the insert position. Instead, use the address of the insert position

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated thanks.

// and create a PtrAdd to adjust the index to start at the first member.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
// and create a PtrAdd to adjust the index to start at the first member.
// and create a PtrAdd adjusting it to the address of member zero.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated, thanks!

APInt Offset(32,
getLoadStoreType(IRInsertPos)->getScalarSizeInBits() / 8 *
IG->getIndex(IRInsertPos),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Worth asserting Offset or index of IRInsertPos is non zero?
I.e., if insert pos is the first member, its address operand must dominate it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added assert, thanks!

/*IsSigned=*/true);
VPValue *OffsetVPV = Plan.getOrAddLiveIn(
ConstantInt::get(IRInsertPos->getParent()->getContext(), -Offset));
VPBuilder B(InsertPos);
Addr = InBounds ? B.createInBoundsPtrAdd(InsertPos->getAddr(), OffsetVPV)
: B.createPtrAdd(InsertPos->getAddr(), OffsetVPV);
}
auto *VPIG = new VPInterleaveRecipe(IG, Addr, StoredValues,
InsertPos->getMask(), NeedsMaskForGaps);
VPIG->insertBefore(InsertPos);

unsigned J = 0;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (Instruction *Member = IG->getMember(i)) {
Expand Down
4 changes: 3 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlanTransforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,9 @@ struct VPlanTransforms {
// widening its memory instructions with a single VPInterleaveRecipe at its
// insertion point.
static void createInterleaveGroups(
const SmallPtrSetImpl<const InterleaveGroup<Instruction> *> &InterleaveGroups,
VPlan &Plan,
const SmallPtrSetImpl<const InterleaveGroup<Instruction> *>
&InterleaveGroups,
VPRecipeBuilder &RecipeBuilder, bool ScalarEpilogueAllowed);

/// Remove dead recipes from \p Plan.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ define void @interleaved_store_first_order_recurrence(ptr noalias %src, ptr %dst
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw nsw i64 [[TMP0]], 3
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 2
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 -2
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> [[TMP10]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,11 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 {
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = add nsw <vscale x 4 x i32> [[TMP3]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[BROADCAST_SPLAT2]]
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP5]]
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 -4
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]])
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
Expand Down Expand Up @@ -121,20 +119,19 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.+]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, <vscale x 4 x i64> [[VEC_IND]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
; CHECK-NEXT: [[TMP7:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, <vscale x 4 x i64> [[TMP7]]
; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP8]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
; CHECK-NEXT: [[TMP9:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
; CHECK-NEXT: [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[TMP9]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP11:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER1]] to <vscale x 4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = mul nsw <vscale x 4 x i32> [[BROADCAST_SPLAT3]], [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <vscale x 4 x i64> [[TMP7]], i64 0
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP13]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32> [[TMP12]])
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP14]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
Expand Down Expand Up @@ -404,10 +401,10 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n
; CHECK-NEXT: [[REVERSE1:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP11]])
; CHECK-NEXT: [[TMP12:%.*]] = add nsw <vscale x 4 x i32> [[REVERSE]], [[VEC_IND]]
; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <vscale x 4 x i32> [[REVERSE1]], [[VEC_IND]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3
; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 1, [[TMP16]]
; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 2, [[TMP16]]
; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]]
; CHECK-NEXT: [[REVERSE2:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP12]])
Expand Down Expand Up @@ -715,16 +712,14 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP6:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[TMP3]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 0
; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC2]], 1
; CHECK-NEXT: [[TMP9:%.*]] = add nsw <vscale x 4 x i32> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 -4
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP9]])
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 4
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
Expand Down Expand Up @@ -1271,14 +1266,11 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
; CHECK-NEXT: [[P:%.+]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0
; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]])
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[P]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
Expand Down
Loading
Loading