From e2cb8b3ad30a5d67aa485a30767a5dd1e406bab1 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 28 Oct 2024 20:14:08 +0100 Subject: [PATCH] [VPlan] Use ResumePhi to create reduction resume phis. (#110004) Use VPInstruction::ResumePhi to create phi nodes for reduction resume values in the scalar preheader, similar to how ResumePhis are used for first-order recurrence resume values after 9a5a8731e77. This allows simplifying createAndCollectMergePhiForReduction to only collect reduction resume phis when vectorizing epilogue loops and adding extra incoming edges from the main vector loop. Updating phis for the epilogue vector loops requires special attention, because additional incoming values from the bypass blocks need to be added. PR: https://github.com/llvm/llvm-project/pull/110004 --- .../Transforms/Vectorize/LoopVectorize.cpp | 130 +++++++++--------- .../RISCV/vplan-vp-intrinsics-reduction.ll | 9 ++ ...-order-recurrence-sink-replicate-region.ll | 2 + .../LoopVectorize/vplan-printing.ll | 9 ++ 4 files changed, 88 insertions(+), 62 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 88086f24dfdce2..778d928252e051 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7562,67 +7562,62 @@ static void addRuntimeUnrollDisableMetaData(Loop *L) { } } -// Check if \p RedResult is a ComputeReductionResult instruction, and if it is -// create a merge phi node for it. -static void createAndCollectMergePhiForReduction( - VPInstruction *RedResult, - VPTransformState &State, Loop *OrigLoop, BasicBlock *LoopMiddleBlock, - bool VectorizingEpilogue) { - if (!RedResult || - RedResult->getOpcode() != VPInstruction::ComputeReductionResult) +// If \p R is a ComputeReductionResult when vectorizing the epilog loop, +// fix the reduction's scalar PHI node by adding the incoming value from the +// main vector loop. +static void fixReductionScalarResumeWhenVectorizingEpilog( + VPRecipeBase *R, VPTransformState &State, BasicBlock *LoopMiddleBlock) { + auto *EpiRedResult = dyn_cast(R); + if (!EpiRedResult || + EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult) return; - auto *PhiR = cast(RedResult->getOperand(0)); - const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor(); - - Value *FinalValue = State.get(RedResult, VPLane(VPLane::getFirstLane())); - auto *ResumePhi = - dyn_cast(PhiR->getStartValue()->getUnderlyingValue()); - if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind( - RdxDesc.getRecurrenceKind())) { - auto *Cmp = cast(PhiR->getStartValue()->getUnderlyingValue()); - assert(Cmp->getPredicate() == CmpInst::ICMP_NE); - assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue()); - ResumePhi = cast(Cmp->getOperand(0)); - } - assert((!VectorizingEpilogue || ResumePhi) && - "when vectorizing the epilogue loop, we need a resume phi from main " - "vector loop"); - - // TODO: bc.merge.rdx should not be created here, instead it should be - // modeled in VPlan. - BasicBlock *LoopScalarPreHeader = OrigLoop->getLoopPreheader(); - // Create a phi node that merges control-flow from the backedge-taken check - // block and the middle block. - auto *BCBlockPhi = - PHINode::Create(FinalValue->getType(), 2, "bc.merge.rdx", - LoopScalarPreHeader->getTerminator()->getIterator()); - - // If we are fixing reductions in the epilogue loop then we should already - // have created a bc.merge.rdx Phi after the main vector body. Ensure that - // we carry over the incoming values correctly. + auto *EpiRedHeaderPhi = + cast(EpiRedResult->getOperand(0)); + const RecurrenceDescriptor &RdxDesc = + EpiRedHeaderPhi->getRecurrenceDescriptor(); + Value *MainResumeValue = + EpiRedHeaderPhi->getStartValue()->getUnderlyingValue(); + if (RecurrenceDescriptor::isAnyOfRecurrenceKind( + RdxDesc.getRecurrenceKind())) { + auto *Cmp = cast(MainResumeValue); + assert(Cmp->getPredicate() == CmpInst::ICMP_NE && + "AnyOf expected to start with ICMP_NE"); + assert(Cmp->getOperand(1) == RdxDesc.getRecurrenceStartValue() && + "AnyOf expected to start by comparing main resume value to original " + "start value"); + MainResumeValue = Cmp->getOperand(0); + } + PHINode *MainResumePhi = cast(MainResumeValue); + + // When fixing reductions in the epilogue loop we should already have + // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry + // over the incoming values correctly. + using namespace VPlanPatternMatch; + auto IsResumePhi = [](VPUser *U) { + return match( + U, m_VPInstruction(m_VPValue(), m_VPValue())); + }; + assert(count_if(EpiRedResult->users(), IsResumePhi) == 1 && + "ResumePhi must have a single user"); + auto *EpiResumePhiVPI = + cast(*find_if(EpiRedResult->users(), IsResumePhi)); + auto *EpiResumePhi = cast(State.get(EpiResumePhiVPI, true)); + BasicBlock *LoopScalarPreHeader = EpiResumePhi->getParent(); + bool Updated = false; for (auto *Incoming : predecessors(LoopScalarPreHeader)) { - if (Incoming == LoopMiddleBlock) - BCBlockPhi->addIncoming(FinalValue, Incoming); - else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming)) - BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming), - Incoming); - else - BCBlockPhi->addIncoming(RdxDesc.getRecurrenceStartValue(), Incoming); + if (is_contained(MainResumePhi->blocks(), Incoming)) { + assert(EpiResumePhi->getIncomingValueForBlock(Incoming) == + RdxDesc.getRecurrenceStartValue() && + "Trying to reset unexpected value"); + assert(!Updated && "Should update at most 1 incoming value"); + EpiResumePhi->setIncomingValueForBlock( + Incoming, MainResumePhi->getIncomingValueForBlock(Incoming)); + Updated = true; + } } - - auto *OrigPhi = cast(PhiR->getUnderlyingValue()); - // TODO: This fixup should instead be modeled in VPlan. - // Fix the scalar loop reduction variable with the incoming reduction sum - // from the vector body and from the backedge value. - int IncomingEdgeBlockIdx = - OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch()); - assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); - // Pick the other block. - int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); - OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi); - Instruction *LoopExitInst = RdxDesc.getLoopExitInstr(); - OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst); + assert(Updated && "Must update EpiResumePhi."); + (void)Updated; } DenseMap LoopVectorizationPlanner::executePlan( @@ -7713,11 +7708,11 @@ DenseMap LoopVectorizationPlanner::executePlan( // 2.5 Collect reduction resume values. auto *ExitVPBB = cast(BestVPlan.getVectorLoopRegion()->getSingleSuccessor()); - for (VPRecipeBase &R : *ExitVPBB) { - createAndCollectMergePhiForReduction( - dyn_cast(&R), State, OrigLoop, - State.CFG.VPBB2IRBB[ExitVPBB], VectorizingEpilogue); - } + if (VectorizingEpilogue) + for (VPRecipeBase &R : *ExitVPBB) { + fixReductionScalarResumeWhenVectorizingEpilog( + &R, State, State.CFG.VPBB2IRBB[ExitVPBB]); + } // 2.6. Maintain Loop Hints // Keep all loop hints from the original loop on the vector loop (we'll @@ -9518,6 +9513,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( }); FinalReductionResult->insertBefore(*MiddleVPBB, IP); + // Order is strict: if there are multiple successors, the first is the exit + // block, second is the scalar preheader. + VPBasicBlock *ScalarPHVPBB = + cast(MiddleVPBB->getSuccessors().back()); + VPBuilder ScalarPHBuilder(ScalarPHVPBB); + auto *ResumePhiRecipe = ScalarPHBuilder.createNaryOp( + VPInstruction::ResumePhi, {FinalReductionResult, PhiR->getStartValue()}, + {}, "bc.merge.rdx"); + auto *RedPhi = cast(PhiR->getUnderlyingInstr()); + Plan->addLiveOut(RedPhi, ResumePhiRecipe); + // Adjust AnyOf reductions; replace the reduction phi for the selected value // with a boolean reduction phi node to check if the condition is true in // any iteration. The final value is selected by the final diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 1326751a847d7d..59db6c197ef8ca 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -65,7 +65,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: No successors ; IF-EVL-INLOOP-EMPTY: ; IF-EVL-INLOOP-NEXT: scalar.ph: +; IF-EVL-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> ; IF-EVL-INLOOP-NEXT: No successors +; IF-EVL-INLOOP-EMPTY: +; IF-EVL-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> ; IF-EVL-INLOOP-NEXT: } ; @@ -104,7 +107,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: No successors ; NO-VP-OUTLOOP-EMPTY: ; NO-VP-OUTLOOP-NEXT: scalar.ph: +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> ; NO-VP-OUTLOOP-NEXT: No successors +; NO-VP-OUTLOOP-EMPTY: +; NO-VP-OUTLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> ; NO-VP-OUTLOOP-NEXT: } ; @@ -143,7 +149,10 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: No successors ; NO-VP-INLOOP-EMPTY: ; NO-VP-INLOOP-NEXT: scalar.ph: +; NO-VP-INLOOP-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RDX]]>, ir<%start> ; NO-VP-INLOOP-NEXT: No successors +; NO-VP-INLOOP-EMPTY: +; NO-VP-INLOOP-NEXT: Live-out i32 %rdx = vp<[[RED_RESUME]]> ; NO-VP-INLOOP-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll index 8e56614a2e3d5c..b05980bef1b38f 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -232,9 +232,11 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<0> +; CHECK-NEXT: EMIT vp<[[RESUME_RED:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<1234> ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: Live-out i32 %recur = vp<[[RESUME_1_P]]> +; CHECK-NEXT: Live-out i32 %and.red = vp<[[RESUME_RED]]> ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 0dde507d08be74..2247295295663e 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -165,7 +165,10 @@ define float @print_reduction(i64 %n, ptr noalias %y) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]> ; CHECK-NEXT: } ; entry: @@ -221,7 +224,10 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out float %red = vp<[[RED_RESUME]]> ; CHECK-NEXT: } ; entry: @@ -447,7 +453,10 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph +; CHECK-NEXT: EMIT vp<[[RED_RESUME:%.+]]> = resume-phi vp<[[RED_RES]]>, ir<0.000000e+00> ; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: Live-out float %sum.07 = vp<[[RED_RESUME]]> ; CHECK-NEXT:} entry: