diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index db650b23e271e2..74104304301a86 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7467,23 +7467,31 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) { } // Check if \p RedResult is a ComputeReductionResult instruction, and if it is -// create a merge phi node for it. -static void createAndCollectMergePhiForReduction( - VPInstruction *RedResult, - VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock, - bool VectorizingEpilogue) { +// create a merge phi node for it and add incoming values from the main vector +// loop. +static void updateAndCollectMergePhiForReductionForEpilogueVectorization( + VPInstruction *RedResult, VPTransformState &State, Loop *OrigLoop, + BasicBlock *LoopMiddleBlock, bool VectorizingEpilogue) { if (!RedResult || RedResult->getOpcode() != VPInstruction::ComputeReductionResult) return; + using namespace VPlanPatternMatch; + VPValue *ResumePhiVPV = + cast(*find_if(RedResult->users(), [](VPUser *U) { + return match(U, m_VPInstruction(m_VPValue(), + m_VPValue())); + })); + auto *BCBlockPhi = cast(State.get(ResumePhiVPV, true)); auto *PhiR = cast(RedResult->getOperand(0)); const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); + if (!VectorizingEpilogue) + return; - Value *FinalValue = State.get(RedResult, VPLane(VPLane::getFirstLane())); auto *ResumePhi = dyn_cast(PhiR->getStartValue()->getUnderlyingValue()); - if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind( - RdxDesc.getRecurrenceKind())) { + if (RecurrenceDescriptor::isAnyOfRecurrenceKind( + RdxDesc.getRecurrenceKind())) { auto *Cmp = cast(PhiR->getStartValue()->getUnderlyingValue()); assert(Cmp->getPredicate() == CmpInst::ICMP_NE); assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue()); @@ -7493,40 +7501,15 @@ static void createAndCollectMergePhiForReduction( "when vectorizing the epilogue loop, we need a resume phi from main " "vector loop"); - // TODO: bc.merge.rdx should not be created here, instead it should be - // modeled in VPlan. BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader(); - // Create a phi node that merges control-flow from the backedge-taken check - // block and the middle block. - auto *BCBlockPhi = - PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx", - LoopScalarPreHeader->getTerminator()->getIterator()); - // If we are fixing reductions in the epilogue loop then we should already // have created a bc.merge.rdx Phi after the main vector body. Ensure that // we carry over the incoming values correctly. for (auto *Incoming : predecessors(LoopScalarPreHeader)) { - if (Incoming == LoopMiddleBlock) - BCBlockPhi->addIncoming(FinalValue, Incoming); - else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming)) - BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), - Incoming); - else - BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming); + if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming)) + BCBlockPhi->setIncomingValueForBlock( + Incoming, ResumePhi->getIncomingValueForBlock(Incoming)); } - - auto *OrigPhi = cast(PhiR->getUnderlyingValue()); - // TODO: This fixup should instead be modeled in VPlan. - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. - int IncomingEdgeBlockIdx = - OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); - assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); - // Pick the other block. - int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); - Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); } DenseMap LoopVectorizationPlanner::executePlan( @@ -7617,11 +7600,12 @@ DenseMap LoopVectorizationPlanner::executePlan( // 2.5 Collect reduction resume values. auto *ExitVPBB = cast(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); - for (VPRecipeBase &R : *ExitVPBB) { - createAndCollectMergePhiForReduction( - dyn_cast(&R), State, OrigLoop, - State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); - } + if (IsEpilogueVectorization) + for (VPRecipeBase &R : *ExitVPBB) { + updateAndCollectMergePhiForReductionForEpilogueVectorization( + dyn_cast(&R), State, OrigLoop, + State.CFG.VPBB2IRBB[ExitVPBB], ExpandedSCEVs); + } // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll @@ -9411,6 +9395,22 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( }); FinalReductionResult->insertBefore(*MiddleVPBB, IP); + VPBasicBlock *ScalarPHVPBB = nullptr; + if (MiddleVPBB->getNumSuccessors() == 2) { + // Order is strict: first is the exit block, second is the scalar + // preheader. + ScalarPHVPBB = cast(MiddleVPBB->getSuccessors()[1]); + } else { + ScalarPHVPBB = cast(MiddleVPBB->getSingleSuccessor()); + } + + VPBuilder ScalarPHBuilder(ScalarPHVPBB); + auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp( + VPInstruction::ResumePhi, {FinalReductionResult, PhiR->getStartValue()}, + {}, "bc.merge.rdx"); + auto *RedPhi = cast(PhiR->getUnderlyingInstr()); + Plan->addLiveOut(RedPhi, ResumePhiRecipe); + // Adjust AnyOf reductions; replace the reduction phi for the selected value // with a boolean reduction phi node to check if the condition is true in // any iteration. The final value is selected by the final diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 90c209cf3f5186..6a435709aeb2b2 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -65,7 +65,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: No successors ; IF-EVL-INLOOP-EMPTY: ; IF-EVL-INLOOP-NEXT: scalar.ph: +; IF-EVL-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> ; IF-EVL-INLOOP-NEXT: No successors +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> ; IF-EVL-INLOOP-NEXT: } ; @@ -104,7 +107,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: No successors ; NO-VP-OUTLOOP-EMPTY: ; NO-VP-OUTLOOP-NEXT: scalar.ph: +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> ; NO-VP-OUTLOOP-NEXT: No successors +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> ; NO-VP-OUTLOOP-NEXT: } ; @@ -143,7 +149,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: No successors ; NO-VP-INLOOP-EMPTY: ; NO-VP-INLOOP-NEXT: scalar.ph: +; NO-VP-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> ; NO-VP-INLOOP-NEXT: No successors +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> ; NO-VP-INLOOP-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index 8e56614a2e3d5c..b05980bef1b38f 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -232,9 +232,11 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> +; CHECK-NEXT: EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234> ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]> +; CHECK-NEXT: Live-out i32 %and.red = vp<[[RESUME_RED]]> ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 0dde507d08be74..2247295295663e 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -165,7 +165,10 @@ define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]> ; CHECK-NEXT: } ; entry: @@ -221,7 +224,10 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]> ; CHECK-NEXT: } ; entry: @@ -447,7 +453,10 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out float %sum.07 = vp<[[RED_RESUME]]> ; CHECK-NEXT:} entry: