diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 30fadc2c939418..cb55b6636fc279 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7507,6 +7507,9 @@ LoopVectorizationPlanner::executePlan(
       OrigLoop->getHeader()->getModule()->getContext());
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
 
+  if (VPlanTransforms::narrowInterleaveGroups(BestVPlan, BestVF)) {
+    LLVM_DEBUG(dbgs() << "Narrowed interleave\n");
+  }
   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
                     << ", UF=" << BestUF << '\n');
   BestVPlan.setName("Final VPlan");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 1a039e0a736145..96b16164738181 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -48,6 +48,9 @@ extern cl::opt<unsigned> ForceTargetInstructionCost;
 
 bool VPRecipeBase::mayWriteToMemory() const {
   switch (getVPDefID()) {
+  case VPInstructionSC: {
+    return !Instruction::isBinaryOp(cast<VPInstruction>(this)->getOpcode());
+  }
   case VPInterleaveSC:
     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
   case VPWidenStoreEVLSC:
@@ -63,6 +66,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPBranchOnMaskSC:
   case VPScalarIVStepsSC:
   case VPPredInstPHISC:
+  case VPVectorPointerSC:
     return false;
   case VPBlendSC:
   case VPReductionEVLSC:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3b7f066f0636f3..285c5be7a105d5 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -668,6 +668,7 @@ static void recursivelyDeleteDeadRecipes(VPValue *V) {
 
 void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
                                          unsigned BestUF,
                                          PredicatedScalarEvolution &PSE) {
+  assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan");
   assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan");
   VPBasicBlock *ExitingVPBB =
@@ -710,6 +711,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF,
   // TODO: Further simplifications are possible
   // 1. Replace inductions with constants.
   // 2. Replace vector loop region with VPBasicBlock.
+  //
 }
 
 /// Sink users of \p FOR after the recipe defining the previous value \p
@@ -1657,3 +1659,129 @@ void VPlanTransforms::createInterleaveGroups(
     }
   }
 }
+
+static bool supportedLoad(VPWidenRecipe *R0, VPValue *V, unsigned Idx) {
+  if (auto *W = dyn_cast_or_null<VPWidenLoadRecipe>(V->getDefiningRecipe())) {
+    if (W->getMask())
+      return false;
+    return !W->getMask() && (R0->getOperand(0) == V || R0->getOperand(1) == V);
+  }
+
+  if (auto *IR = dyn_cast_or_null<VPInterleaveRecipe>(V->getDefiningRecipe())) {
+    return IR->getInterleaveGroup()->getFactor() ==
+               IR->getInterleaveGroup()->getNumMembers() &&
+           IR->getVPValue(Idx) == V;
+  }
+  return false;
+}
+
+/// Returns true if \p IR is a consecutive interleave group with \p VF members.
+static bool isConsecutiveInterleaveGroup(VPInterleaveRecipe *IR,
+                                         ElementCount VF) {
+  if (!IR)
+    return false;
+  auto IG = IR->getInterleaveGroup();
+  return IG->getFactor() == IG->getNumMembers() &&
+         IG->getNumMembers() == VF.getKnownMinValue();
+}
+
+bool VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF) {
+  using namespace llvm::VPlanPatternMatch;
+  if (VF.isScalable())
+    return false;
+
+  bool Changed = false;
+  SmallVector<VPInterleaveRecipe *> StoreGroups;
+  for (auto &R : make_early_inc_range(
+           *Plan.getVectorLoopRegion()->getEntryBasicBlock())) {
+    if (match(&R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
+        isa<VPCanonicalIVPHIRecipe>(&R))
+      continue;
+
+    // Bail out on recipes not supported at the moment:
+    // * phi recipes other than the canonical induction
+    // * recipes writing to memory except interleave groups
+    // Only support plans with a canonical induction phi.
+    if ((R.isPhi() && !isa<VPCanonicalIVPHIRecipe>(&R)) ||
+        (R.mayWriteToMemory() && !isa<VPInterleaveRecipe>(&R)))
+      return false;
+
+    auto *IR = dyn_cast<VPInterleaveRecipe>(&R);
+    if (!IR)
+      continue;
+
+    if (!isConsecutiveInterleaveGroup(IR, VF))
+      return false;
+    if (IR->getStoredValues().empty())
+      continue;
+
+    auto *Lane0 = dyn_cast_or_null<VPWidenRecipe>(
+        IR->getStoredValues()[0]->getDefiningRecipe());
+    if (!Lane0)
+      return false;
+    for (const auto &[I, V] : enumerate(IR->getStoredValues())) {
+      auto *R = dyn_cast<VPWidenRecipe>(V->getDefiningRecipe());
+      if (!R || R->getOpcode() != Lane0->getOpcode())
+        return false;
+      // Work around captured structured bindings being a C++20 extension.
+      auto Idx = I;
+      if (any_of(R->operands(), [Lane0, Idx](VPValue *V) {
+            return !supportedLoad(Lane0, V, Idx);
+          }))
+        return false;
+    }
+
+    StoreGroups.push_back(IR);
+  }
+
+  // Narrow operation tree rooted at store groups.
+  for (auto *StoreGroup : StoreGroups) {
+    auto *Lane0 = cast<VPWidenRecipe>(
+        StoreGroup->getStoredValues()[0]->getDefiningRecipe());
+
+    unsigned LoadGroupIdx =
+        isa<VPInterleaveRecipe>(Lane0->getOperand(1)->getDefiningRecipe()) ? 1
+                                                                           : 0;
+    unsigned WideLoadIdx = 1 - LoadGroupIdx;
+    auto *LoadGroup = cast<VPInterleaveRecipe>(
+        Lane0->getOperand(LoadGroupIdx)->getDefiningRecipe());
+
+    auto *WideLoad = cast<VPWidenLoadRecipe>(
+        Lane0->getOperand(WideLoadIdx)->getDefiningRecipe());
+
+    // Narrow wide load to uniform scalar load, as transformed VPlan will only
+    // process one original iteration.
+    auto *N = new VPReplicateRecipe(&WideLoad->getIngredient(),
+                                    WideLoad->operands(), true);
+    // Narrow interleave group to wide load, as transformed VPlan will only
+    // process one original iteration.
+    auto *L = new VPWidenLoadRecipe(
+        *cast<LoadInst>(LoadGroup->getInterleaveGroup()->getInsertPos()),
+        LoadGroup->getAddr(), LoadGroup->getMask(), true, false,
+        LoadGroup->getDebugLoc());
+    L->insertBefore(LoadGroup);
+    N->insertBefore(LoadGroup);
+    Lane0->setOperand(LoadGroupIdx, L);
+    Lane0->setOperand(WideLoadIdx, N);
+
+    auto *S = new VPWidenStoreRecipe(
+        *cast<StoreInst>(StoreGroup->getInterleaveGroup()->getInsertPos()),
+        StoreGroup->getAddr(), Lane0, nullptr, true, false,
+        StoreGroup->getDebugLoc());
+    S->insertBefore(StoreGroup);
+    StoreGroup->eraseFromParent();
+    Changed = true;
+  }
+
+  if (!Changed)
+    return false;
+
+  // Adjust induction to reflect that the transformed plan only processes one
+  // original iteration.
+  auto *CanIV = Plan.getCanonicalIV();
+  VPInstruction *Inc = cast<VPInstruction>(CanIV->getBackedgeValue());
+  Inc->setOperand(
+      1, Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+  removeDeadRecipes(Plan);
+  return true;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 3b792ee32dce6e..0bb9d0a7fde853 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -121,6 +121,8 @@ struct VPlanTransforms {
 
   /// Remove dead recipes from \p Plan.
   static void removeDeadRecipes(VPlan &Plan);
+
+  static bool narrowInterleaveGroups(VPlan &Plan, ElementCount VF);
 };
 
 } // namespace llvm
diff --git a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
index d6cd44d0c4f0ca..ffe66ff0007fbb 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/transform-narrow-interleave-to-widen-memory.ll
@@ -20,28 +20,16 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV]]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[ARRAYIDX]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP8]], align 8
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 3
-; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC3]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 -3
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP14]], <8 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> [[TMP9]], <8 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <16 x i32>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP13]], <16 x i64> poison, <16 x i32>
-; CHECK-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i64> [[STRIDED_VEC2]], [[WIDE_LOAD]]
+; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr [[TMP3]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
@@ -50,23 +38,23 @@ define void @test_4xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT: br label %[[LOOP:.*]]
 ; CHECK: [[LOOP]]:
 ; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
-; CHECK-NEXT: [[L_FACTOR:%.*]] = load i64, ptr [[ARRAYIDX1]], align 8
+; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds i64, ptr [[FACTOR]], i64 [[IV1]]
+; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
 ; CHECK-NEXT: [[DATA_0:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 0
 ; CHECK-NEXT: [[L_0:%.*]] = load i64, ptr [[DATA_0]], align 8
-; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_FACTOR]], [[L_0]]
+; CHECK-NEXT: [[MUL_0:%.*]] = mul i64 [[L_2]], [[L_0]]
 ; CHECK-NEXT: store i64 [[MUL_0]], ptr [[DATA_0]], align 8
 ; CHECK-NEXT: [[DATA_1:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 1
 ; CHECK-NEXT: [[L_1:%.*]] = load i64, ptr [[DATA_1]], align 8
-; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_FACTOR]], [[L_1]]
+; CHECK-NEXT: [[MUL_1:%.*]] = mul i64 [[L_2]], [[L_1]]
 ; CHECK-NEXT: store i64 [[MUL_1]], ptr [[DATA_1]], align 8
-; CHECK-NEXT: [[DATA_2:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
-; CHECK-NEXT: [[L_2:%.*]] = load i64, ptr [[DATA_2]], align 8
-; CHECK-NEXT: [[MUL_2:%.*]] = mul i64 [[L_FACTOR]], [[L_2]]
-; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_2]], align 8
+; CHECK-NEXT: [[DATA_4:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 2
+; CHECK-NEXT: [[L_4:%.*]] = load i64, ptr [[DATA_4]], align 8
+; CHECK-NEXT: [[MUL_2:%.*]] = mul i64 [[L_2]], [[L_4]]
+; CHECK-NEXT: store i64 [[MUL_2]], ptr [[DATA_4]], align 8
 ; CHECK-NEXT: [[DATA_3:%.*]] = getelementptr inbounds { i64, i64, i64, i64 }, ptr [[DATA]], i64 [[IV1]], i32 3
 ; CHECK-NEXT: [[L_3:%.*]] = load i64, ptr [[DATA_3]], align 8
-; CHECK-NEXT: [[MUL_3:%.*]] = mul i64 [[L_FACTOR]], [[L_3]]
+; CHECK-NEXT: [[MUL_3:%.*]] = mul i64 [[L_2]], [[L_3]]
 ; CHECK-NEXT: store i64 [[MUL_3]], ptr [[DATA_3]], align 8
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
@@ -129,13 +117,10 @@ define void @test_2xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32>
 ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32>
 ; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP3]], 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
 ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP9]], <8 x i32>
 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32>
-; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
@@ -212,14 +197,11 @@ define void @test_2xi64_interleave_loads_order_flipped(ptr noalias %data, ptr no
 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32>
 ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = or disjoint i64 [[TMP3]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP6]]
 ; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
 ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -1
 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> [[TMP9]], <8 x i32>
 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32>
-; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -297,13 +279,10 @@ define void @test_2xi64_store_order_flipped_1(ptr noalias %data, ptr noalias %fa
 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32>
 ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32>
 ; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP3]], 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP7]]
 ; CHECK-NEXT: [[TMP9:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <8 x i32>
 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> poison, <8 x i32>
-; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP10]], align 8
+; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -458,6 +437,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[GEP_SRC_0]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
 ; CHECK-NEXT: [[TMP5:%.*]] = shl nsw i64 [[IV]], 1
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[DATA]], i64 [[TMP5]]
 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
 ; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-NEXT: [[TMP7:%.*]] = or disjoint i64 [[TMP5]], 1
@@ -469,7 +449,7 @@ define void @test_2xi64_different_loads_feeding_fmul(ptr noalias %data, ptr noal
 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP11]], align 8
 ; CHECK-NEXT: [[TMP12:%.*]] = mul <4 x i64> [[WIDE_LOAD2]], [[STRIDED_VEC]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 -1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP16]], i32 0
 ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP12]], <8 x i32>
 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP14]], <8 x i64> poison, <8 x i32>
 ; CHECK-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP13]], align 8
@@ -554,14 +534,12 @@ define void @test_3xi64(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32>
 ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC]]
 ; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC1]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds { i64, i64, i64 }, ptr [[DATA]], i64 [[IV]], i32 2
 ; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[WIDE_LOAD]], [[STRIDED_VEC2]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -2
 ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP6]], <8 x i32>
 ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <8 x i32>
 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i64> [[TMP10]], <8 x i64> [[TMP11]], <12 x i32>
 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP12]], <12 x i64> poison, <12 x i32>
-; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 8
+; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP4]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
@@ -693,14 +671,12 @@ define void @test_3xi32(ptr noalias %data, ptr noalias %factor, i64 noundef %n)
 ; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <24 x i32> [[WIDE_VEC1]], <24 x i32> poison, <8 x i32>
 ; CHECK-NEXT: [[TMP7:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC2]]
 ; CHECK-NEXT: [[TMP8:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds { i32, i32, i32 }, ptr [[DATA]], i64 [[IV]], i32 2
 ; CHECK-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 -2
 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> [[TMP8]], <16 x i32>
 ; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> poison, <16 x i32>
 ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i32> [[TMP12]], <16 x i32> [[TMP13]], <24 x i32>
 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP14]], <24 x i32> poison, <24 x i32>
-; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP11]], align 8
+; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 8
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]