From f99672568fda6a9bc1ee4f09d5d84066f4979889 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin
Date: Mon, 28 Jun 2021 11:26:10 +0100
Subject: [PATCH] [LoopVectorize] Fix strict reductions where VF = 1

Currently we will allow loops with a fixed width VF of 1 to vectorize
if the -enable-strict-reductions flag is set. However, the loop
vectorizer will not use ordered reductions if `VF.isScalar()`, and the
resulting vectorized loop will be out of order.

This patch removes `VF.isVector()` from the check that decides whether
ordered reductions should be used. Also, instead of converting the
FAdds to reductions when the VF is 1, the operands of the FAdds are
changed such that the order is preserved.

Reviewed By: david-arm

Differential Revision: https://reviews.llvm.org/D104533
---
 .../Vectorize/LoopVectorizationPlanner.h        |  3 +-
 llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 25 +++---
 .../LoopVectorize/AArch64/strict-fadd.ll        | 97 +++++++++++++++++++---
 3 files changed, 103 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 2213e73..5c4c4fd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -356,7 +356,8 @@ private:
   /// reductions, with one operand being vector and the other being the scalar
   /// reduction chain.
   void adjustRecipesForInLoopReductions(VPlanPtr &Plan,
-                                        VPRecipeBuilder &RecipeBuilder);
+                                        VPRecipeBuilder &RecipeBuilder,
+                                        ElementCount MinVF);
 };
 
 } // namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index cec0cf2..e609fdd 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4344,8 +4344,7 @@ void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
   // any loop invariant values.
   BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
 
-  bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
-                   Cost->useOrderedReductions(RdxDesc);
+  bool IsOrdered = IsInLoopReductionPhi && Cost->useOrderedReductions(RdxDesc);
 
   for (unsigned Part = 0; Part < UF; ++Part) {
     if (IsOrdered && Part > 0)
@@ -4759,8 +4758,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
       Type *VecTy = ScalarPHI ? PN->getType()
                               : VectorType::get(PN->getType(), State.VF);
 
-      bool IsOrdered = State.VF.isVector() &&
-                       Cost->isInLoopReduction(cast<PHINode>(PN)) &&
+      bool IsOrdered = Cost->isInLoopReduction(cast<PHINode>(PN)) &&
                        Cost->useOrderedReductions(*RdxDesc);
       unsigned LastPartForNewPhi = IsOrdered ? 1 : State.UF;
       for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
@@ -9280,8 +9278,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
   }
 
   // Adjust the recipes for any inloop reductions.
-  if (Range.Start.isVector())
-    adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
+  adjustRecipesForInLoopReductions(Plan, RecipeBuilder, Range.Start);
 
   // Finally, if tail is folded by masking, introduce selects between the phi
   // and the live-out instruction of each reduction, at the end of the latch.
@@ -9356,12 +9353,15 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
 // reductions, with one operand being vector and the other being the scalar
 // reduction chain.
 void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
-    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder) {
+    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
   for (auto &Reduction : CM.getInLoopReductionChains()) {
     PHINode *Phi = Reduction.first;
     RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
 
+    if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
+      continue;
+
     // ReductionOperations are orders top-down from the phi's use to the
     // LoopExitValue. We keep a track of the previous item (the Chain) to tell
     // which of the two operands will remain scalar and which will be reduced.
@@ -9378,7 +9378,7 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
                "Expected to replace a VPWidenSelectSC");
         FirstOpId = 1;
       } else {
-        assert(isa<VPWidenRecipe>(WidenRecipe) &&
+        assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe)) &&
                "Expected to replace a VPWidenSC");
         FirstOpId = 0;
       }
@@ -9527,8 +9527,13 @@ void VPReductionRecipe::execute(VPTransformState &State) {
     Value *NewRed;
     Value *NextInChain;
     if (IsOrdered) {
-      NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
-                                      PrevInChain);
+      if (State.VF.isVector())
+        NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
+                                        PrevInChain);
+      else
+        NewRed = State.Builder.CreateBinOp(
+            (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(),
+            PrevInChain, NewVecOp);
       PrevInChain = NewRed;
     } else {
       PrevInChain = State.get(getChainOp(), Part);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
index 9e402f9..f2d5f42 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -693,14 +693,89 @@ for.end:
   ret float %add6
 }
 
-!0 = distinct !{!0, !4, !7, !9}
-!1 = distinct !{!1, !4, !8, !9}
-!2 = distinct !{!2, !5, !7, !9}
-!3 = distinct !{!3, !6, !7, !9, !10}
-!4 = !{!"llvm.loop.vectorize.width", i32 8}
-!5 = !{!"llvm.loop.vectorize.width", i32 4}
-!6 = !{!"llvm.loop.vectorize.width", i32 2}
-!7 = !{!"llvm.loop.interleave.count", i32 1}
-!8 = !{!"llvm.loop.interleave.count", i32 4}
-!9 = !{!"llvm.loop.vectorize.enable", i1 true}
-!10 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+; Test reductions for a VF of 1 and a UF > 1.
+define float @fadd_scalar_vf(float* noalias nocapture readonly %a, i64 %n) {
+; CHECK-ORDERED-LABEL: @fadd_scalar_vf
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, {{.*}} ], [ %[[FADD4:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-ORDERED: %[[LOAD3:.*]] = load float, float*
+; CHECK-ORDERED: %[[LOAD4:.*]] = load float, float*
+; CHECK-ORDERED: %[[FADD1:.*]] = fadd float %[[VEC_PHI]], %[[LOAD1]]
+; CHECK-ORDERED: %[[FADD2:.*]] = fadd float %[[FADD1]], %[[LOAD2]]
+; CHECK-ORDERED: %[[FADD3:.*]] = fadd float %[[FADD2]], %[[LOAD3]]
+; CHECK-ORDERED: %[[FADD4]] = fadd float %[[FADD3]], %[[LOAD4]]
+; CHECK-ORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-ORDERED: scalar.ph
+; CHECK-ORDERED: %[[MERGE_RDX:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[FADD4]], %middle.block ]
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: %[[SUM_PHI:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD5:.*]], %for.body ]
+; CHECK-ORDERED: %[[LOAD5:.*]] = load float, float*
+; CHECK-ORDERED: %[[FADD5]] = fadd float %[[LOAD5]], %[[SUM_PHI]]
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD5]], %for.body ], [ %[[FADD4]], %middle.block ]
+; CHECK-ORDERED: ret float %[[RES_PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_scalar_vf
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD1:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD3:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD4:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, float*
+; CHECK-UNORDERED: %[[LOAD4:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1]] = fadd float %[[LOAD1]], %[[VEC_PHI1]]
+; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[LOAD2]], %[[VEC_PHI2]]
+; CHECK-UNORDERED: %[[FADD3]] = fadd float %[[LOAD3]], %[[VEC_PHI3]]
+; CHECK-UNORDERED: %[[FADD4]] = fadd float %[[LOAD4]], %[[VEC_PHI4]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd float %[[FADD2]], %[[FADD1]]
+; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd float %[[FADD3]], %[[BIN_RDX1]]
+; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd float %[[FADD4]], %[[BIN_RDX2]]
+; CHECK-UNORDERED: scalar.ph
+; CHECK-UNORDERED: %[[MERGE_RDX:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[BIN_RDX3]], %middle.block ]
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[SUM_PHI:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD5:.*]], %for.body ]
+; CHECK-UNORDERED: %[[LOAD5:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD5]] = fadd float %[[LOAD5]], %[[SUM_PHI]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD5]], %for.body ], [ %[[BIN_RDX3]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RES_PHI]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf
+; CHECK-NOT-VECTORIZED-NOT: @vector.body
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:
+  ret float %add
+}
+
+!0 = distinct !{!0, !5, !9, !11}
+!1 = distinct !{!1, !5, !10, !11}
+!2 = distinct !{!2, !6, !9, !11}
+!3 = distinct !{!3, !7, !9, !11, !12}
+!4 = distinct !{!4, !8, !10, !11}
+!5 = !{!"llvm.loop.vectorize.width", i32 8}
+!6 = !{!"llvm.loop.vectorize.width", i32 4}
+!7 = !{!"llvm.loop.vectorize.width", i32 2}
+!8 = !{!"llvm.loop.vectorize.width", i32 1}
+!9 = !{!"llvm.loop.interleave.count", i32 1}
+!10 = !{!"llvm.loop.interleave.count", i32 4}
+!11 = !{!"llvm.loop.vectorize.enable", i1 true}
+!12 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
-- 
2.7.4
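
Note on exercising the new test: the RUN lines at the top of strict-fadd.ll are
not touched by this patch, so the mapping from the CHECK prefixes above to opt
invocations is not visible in the hunks. As a rough sketch only (it assumes the
legacy -loop-vectorize pass and the -enable-strict-reductions flag mentioned in
the commit message; the authoritative flags are whatever the test's own RUN
lines use), the ordered-reduction checks for the VF=1/UF=4 case could be
exercised with something like:

  # Sketch, not the test's actual RUN line; exact flags may differ in-tree.
  opt -S -mtriple=aarch64-unknown-linux-gnu -loop-vectorize -enable-strict-reductions \
      llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll \
    | FileCheck llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll --check-prefix=CHECK-ORDERED

Running the file through llvm-lit instead picks up every RUN line and prefix
combination automatically.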