cl::desc("Prefer in-loop vector reductions, "
"overriding the targets preference."));
-// FIXME: When loop hints are passed which allow reordering of FP operations,
-// we still choose to use strict reductions with this flag. We should instead
-// use the default behaviour of vectorizing with unordered reductions if
-// reordering is allowed.
cl::opt<bool> EnableStrictReductions(
"enable-strict-reductions", cl::init(false), cl::Hidden,
cl::desc("Enable the vectorisation of loops with in-order (strict) "
/// Fix the non-induction PHIs in the OrigPHIsToFix vector.
void fixNonInductionPHIs(VPTransformState &State);
+ /// Returns true if reordering of FP operations is not allowed and we are
+ /// able to vectorize with strict in-order reductions for the given RdxDesc.
+ bool useOrderedReductions(RecurrenceDescriptor &RdxDesc);
+
/// Create a broadcast instruction. This method generates a broadcast
/// instruction (shuffle) for loop invariant values and for the induction
/// value. If this is the induction variable then we extend it to N, N+1, ...
outside. In-loop reductions are collected into InLoopReductionChains.
void collectInLoopReductions();
+ /// Returns true if we should use strict in-order reductions for the given
+ /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
+ /// the IsOrdered flag of RdxDesc is set, and reordering of FP operations
+ /// is not allowed.
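+ /// For example, an fadd reduction whose operations lack the reassoc
+ /// fast-math flag is marked IsOrdered; if -enable-strict-reductions is
+ /// also passed and the loop's hints do not permit reordering, the
+ /// reduction is vectorized in-order rather than as an unordered reduction.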
+ bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
+ return EnableStrictReductions && !Hints->allowReordering() &&
+ RdxDesc.isOrdered();
+ }
+
/// \returns The smallest bitwidth each instruction can be represented with.
/// The vector equivalents of these instructions should be truncated to this
/// type.
LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
}
-static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
- return EnableStrictReductions && RdxDesc.isOrdered();
-}
-
void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR,
VPTransformState &State) {
PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi &&
- useOrderedReductions(RdxDesc);
+ Cost->useOrderedReductions(RdxDesc);
for (unsigned Part = 0; Part < UF; ++Part) {
if (IsOrdered && Part > 0)
}
}
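+// Thin wrapper over the cost model's query: VPlan recipes reach the
+// vectorizer only through VPTransformState (State.ILV), so this lets them
+// ask whether a reduction must be generated in-order.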
+bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
+ return Cost->useOrderedReductions(RdxDesc);
+}
+
void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
VPUser &Operands, unsigned UF,
ElementCount VF, bool IsPtrLoopInvariant,
bool IsOrdered = State.VF.isVector() &&
Cost->isInLoopReduction(cast<PHINode>(PN)) &&
- useOrderedReductions(*RdxDesc);
+ Cost->useOrderedReductions(*RdxDesc);
for (unsigned Part = 0; Part < State.UF; ++Part) {
// This is phase one of vectorizing PHIs.
Value *PrevInChain = State.get(getChainOp(), 0);
for (unsigned Part = 0; Part < State.UF; ++Part) {
RecurKind Kind = RdxDesc->getRecurrenceKind();
- bool IsOrdered = useOrderedReductions(*RdxDesc);
+ bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
Value *NewVecOp = State.get(getVecOp(), Part);
if (VPValue *Cond = getCondOp()) {
Value *NewCond = State.get(Cond, Part);
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=true -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
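+; The four RUN lines cover every combination of the two flags: with
+; reordering disallowed, -enable-strict-reductions decides between in-order
+; vectorization (CHECK-ORDERED) and not vectorizing at all
+; (CHECK-NOT-VECTORIZED); once reordering is allowed, the default unordered
+; reductions are used regardless of the strict-reductions flag
+; (CHECK-UNORDERED).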
define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
; CHECK-ORDERED-LABEL: @fadd_strict
; return sum;
;}
;
-; Note: These tests do not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
-; with the -hints-allow-reordering flag set to true.
; Strict reduction could be performed in-loop, but ordered FP induction variables are not supported
+; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
+; with the -hints-allow-reordering flag set to true.
define float @induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) {
; CHECK-ORDERED-LABEL: @induction_and_reduction
; CHECK-ORDERED-NOT: vector.body
; CHECK-ORDERED: vector.body
; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ]
; CHECK-ORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ]
-; CHECK-ORDERED: %[[STEP_ADD:.*]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: %[[LOAD2:.*]] = load <4 x float>, <4 x float>*
; CHECK-ORDERED: %[[FADD1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[RDX_PHI]], <4 x float> %[[LOAD1]])
-; CHECK-ORDERED: %[[FADD2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[FADD1]], <4 x float> %[[LOAD2]])
-; CHECK-ORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[STEP_ADD]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+; CHECK-ORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
; CHECK-ORDERED: for.body
-; CHECK-ORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD3:.*]], %for.body ]
+; CHECK-ORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
; CHECK-ORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ]
; CHECK-ORDERED: store float %[[IND_SUM_PHI]], float*
; CHECK-ORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00
-; CHECK-ORDERED: %[[LOAD3:.*]] = load float, float*
-; CHECK-ORDERED: %[[FADD3]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD3]]
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-ORDERED: %[[FADD2]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]]
; CHECK-ORDERED: for.end
-; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD3]], %for.body ], [ %[[FADD2]], %middle.block ]
+; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[FADD1]], %middle.block ]
; CHECK-ORDERED: ret float %[[RES_PHI]]
; CHECK-UNORDERED-LABEL: @fast_induction_and_reduction
-; CHECK-UNORDERED-NOT: vector.body
+; CHECK-UNORDERED: vector.ph
+; CHECK-UNORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[RDX_PHI]], %[[LOAD1]]
+; CHECK-UNORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: %[[VEC_RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD]])
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD:.*]], %for.body ]
+; CHECK-UNORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ]
+; CHECK-UNORDERED: store float %[[IND_SUM_PHI]], float*
+; CHECK-UNORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[VEC_RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RES_PHI]]
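+; With reordering allowed, the reduction stays as a wide vector fadd inside
+; the loop and is reduced once with llvm.vector.reduce.fadd in the middle
+; block, rather than being reduced in-order on every iteration.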
; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_and_reduction
; CHECK-NOT-VECTORIZED-NOT: vector.body
%add3 = fadd float %sum.015, %0
%iv.next = add nuw nsw i64 %iv, 1
%exitcond.not = icmp eq i64 %iv.next, %N
- br i1 %exitcond.not, label %for.end, label %for.body
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
for.end:
ret float %add3
}
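+; Note: The !llvm.loop !2 metadata on the latch branch supplies the loop
+; hints that let the CHECK-UNORDERED run vectorize this function, unlike the
+; hint-free tests above (the !2 node itself is presumably defined with the
+; file's other metadata).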
; The FP induction is fast, but here we can't vectorize, as only one of the reductions is an FAdd that can be performed in-loop
+; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
+; with the -hints-allow-reordering flag set to true.
define float @fast_induction_unordered_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, float* noalias nocapture %B, i64 %N) {
; CHECK-ORDERED-LABEL: @fast_induction_unordered_reduction