/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
- bool requiresScalarEpilogue() const {
+ bool requiresScalarEpilogue(ElementCount VF) const {
if (!isScalarEpilogueAllowed())
return false;
// If we might exit from anywhere but the latch, must run the exiting
// iteration in scalar form.
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
return true;
- return InterleaveInfo.requiresScalarEpilogue();
+ return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
}
/// Returns true if a scalar epilogue is not allowed due to optsize or a
// unroll factor (number of SIMD instructions).
Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
- // There are two cases where we need to ensure (at least) the last iteration
- // runs in the scalar remainder loop. Thus, if the step evenly divides
- // the trip count, we set the remainder to be equal to the step. If the step
- // does not evenly divide the trip count, no adjustment is necessary since
- // there will already be scalar iterations. Note that the minimum iterations
- // check ensures that N >= Step. The cases are:
- // 1) If there is a non-reversed interleaved group that may speculatively
- // access memory out-of-bounds.
- // 2) If any instruction may follow a conditionally taken exit. That is, if
- // the loop contains multiple exiting blocks, or a single exiting block
- // which is not the latch.
- if (VF.isVector() && Cost->requiresScalarEpilogue()) {
+ // There are cases where we *must* run at least one iteration in the remainder
+ // loop. See requiresScalarEpilogue() in the cost model for when this can
+ // happen. If the step evenly divides the trip count, we set the remainder to
+ // be equal to the step. If the step does not evenly divide the trip count, no
+ // adjustment is necessary since there will already be scalar iterations. Note
+ // that the minimum iterations check ensures that N >= Step.
+ if (Cost->requiresScalarEpilogue(VF)) {
auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
R = Builder.CreateSelect(IsZero, Step, R);
}
// vector trip count is zero. This check also covers the case where adding one
// to the backedge-taken count overflowed leading to an incorrect trip count
// of zero. In this case we will also jump to the scalar loop.
- auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
- : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
// If tail is to be folded, vector loop takes care of all iterations.
Value *CheckMinIters = Builder.getFalse();
// Generate code to check if the loop's trip count is less than VF * UF of the
// main vector loop.
- auto P =
- Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
+ ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
Value *CheckMinIters = Builder.CreateICmp(
P, Count, ConstantInt::get(Count->getType(), VFactor * UFactor),
// Generate code to check if the loop's trip count is less than VF * UF of the
// vector epilogue loop.
- auto P =
- Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
+ auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
+ ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
Value *CheckMinIters = Builder.CreateICmp(
P, Count,
; RUN: opt %s -S -loop-vectorize -force-vector-interleave=2 | FileCheck %s
; Demonstrate a case where we unroll a loop, but don't vectorize it.
-; This currently reveals a miscompile. The original loop runs stores in
-; the latch block on iterations 0 to 1022, and exits when %indvars.iv = 1023.
-; Currently, the unrolled loop produced by the vectorizer runs the iteration
-; where %indvar.iv = 1023 in the vector.body loop before exiting. This results
-; in an out of bounds access..
+; The original loop runs stores in the latch block on iterations 0 to 1022,
+; and exits when %indvars.iv = 1023. (That is, it actually runs the stores
+; for an odd number of iterations.) If we unroll by two in the "vector.body"
+; loop, we must exit to the epilogue on the iteration with %indvars.iv = 1022
+; to avoid an out-of-bounds access.
define void @test(double* %data) {
; CHECK-LABEL: @test(
; CHECK-NEXT: store double [[TMP8]], double* [[TMP4]], align 8
; CHECK-NEXT: store double [[TMP9]], double* [[TMP5]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1022
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1022
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1022, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_LATCH:%.*]] ]