/// Generate the IR code for the body of the vectorized loop according to the
/// best selected \p VF, \p UF and VPlan \p BestPlan.
+ /// TODO: \p IsEpilogueVectorization is needed to avoid issues due to epilogue
+ /// vectorization re-using plans for both the main and epilogue vector loops.
+ /// It should be removed once the re-use issue has been fixed.
void executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
- InnerLoopVectorizer &LB, DominatorTree *DT);
+ InnerLoopVectorizer &LB, DominatorTree *DT,
+ bool IsEpilogueVectorization);
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void printPlans(raw_ostream &O);
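A minimal sketch, not part of the patch, of how the new flag is meant to be threaded through by epilogue vectorization (names follow the call sites further down in this diff): per the TODO above, the planner may hand back the same VPlan for both the main and epilogue vector loops, so both executions pass true to keep the shared plan from being destructively simplified during the first run.

  // Both calls may be backed by the same VPlan object, so the TC <= VF * UF
  // terminator fold must stay suppressed for each of them.
  VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
  LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT,
                  /*IsEpilogueVectorization=*/true);
  VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
  LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV, DT,
                  /*IsEpilogueVectorization=*/true);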
void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
VPlan &BestVPlan,
InnerLoopVectorizer &ILV,
- DominatorTree *DT) {
+ DominatorTree *DT,
+ bool IsEpilogueVectorization) {
LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF
<< '\n');
// 2. Copy and widen instructions from the old loop into the new loop.
BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
ILV.getOrCreateVectorTripCount(nullptr),
- CanonicalIVStartValue, State);
+ CanonicalIVStartValue, State,
+ IsEpilogueVectorization);
BestVPlan.execute(&State);
&CM, BFI, PSI, Checks);
LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
<< L->getHeader()->getParent()->getName() << "\"\n");
- LVP.executePlan(VF.Width, 1, BestPlan, LB, DT);
+ LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
}
// Mark the loop as already vectorized to avoid vectorizing again.
&CM, BFI, PSI, Checks);
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
- LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT);
+ LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
ORE->emit([&]() {
return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
- DT);
+ DT, true);
++LoopsVectorized;
// Second pass vectorizes the epilogue and adjusts the control flow
}
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
- DT);
+ DT, true);
++LoopsEpilogueVectorized;
if (!MainILV.areSafetyChecksAdded())
&LVL, &CM, BFI, PSI, Checks);
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
- LVP.executePlan(VF.Width, IC, BestPlan, LB, DT);
+ LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
++LoopsVectorized;
// Add metadata to disable runtime unrolling a scalar loop when there
void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
Value *CanonicalIVStartValue,
- VPTransformState &State) {
+ VPTransformState &State,
+ bool IsEpilogueVectorization) {
VPBasicBlock *ExitingVPBB = getVectorLoopRegion()->getExitingBasicBlock();
auto *Term = dyn_cast<VPInstruction>(&ExitingVPBB->back());
// Try to simplify BranchOnCount to 'BranchOnCond true' if TC <= VF * UF when
// preparing to execute the plan for the main vector loop.
- if (!CanonicalIVStartValue && Term &&
+ if (!IsEpilogueVectorization && Term &&
Term->getOpcode() == VPInstruction::BranchOnCount &&
isa<ConstantInt>(TripCountV)) {
ConstantInt *C = cast<ConstantInt>(TripCountV);
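The body of this guard falls outside the hunk; a sketch of the assumed continuation, following the comment above (the conditional terminator is replaced by an unconditional exit when the constant trip count fits in a single vector iteration):

  uint64_t TCVal = C->getZExtValue();
  if (TCVal && TCVal <= State.VF.getKnownMinValue() * State.UF) {
    // The back edge can never be taken: fold BranchOnCount into a
    // 'BranchOnCond true' terminator on ExitingVPBB (assumed body).
  }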
/// Prepare the plan for execution, setting up the required live-in values.
void prepareToExecute(Value *TripCount, Value *VectorTripCount,
- Value *CanonicalIVStartValue, VPTransformState &State);
+ Value *CanonicalIVStartValue, VPTransformState &State,
+ bool IsEpilogueVectorization);
/// Generate the IR code for this VPlan.
void execute(struct VPTransformState *State);
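For reference, a condensed sketch of the intended calling sequence for these two entry points, mirroring LoopVectorizationPlanner::executePlan above:

  // Bind live-ins (trip count, vector trip count, canonical-IV start value)
  // before generating code; the new flag gates the fold that is legal only
  // for plans not shared with an epilogue loop.
  Plan.prepareToExecute(TripCountV, VectorTripCountV, CanonicalIVStartValue,
                        State, /*IsEpilogueVectorization=*/false);
  Plan.execute(&State);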
define zeroext i8 @sum() {
; CHECK-LABEL: @sum(
; CHECK-NEXT: iter.check:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 0
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <64 x i8> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <64 x i8> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <64 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP1]], align 16
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 64
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <64 x i8>*
; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <64 x i8>, <64 x i8>* [[TMP3]], align 16
-; CHECK-NEXT: [[TMP4:%.*]] = add <64 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = add <64 x i8> [[WIDE_LOAD2]], zeroinitializer
-; CHECK-NEXT: [[INDEX_NEXT:%.*]] = add nuw i64 0, 128
+; CHECK-NEXT: [[TMP4]] = add <64 x i8> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP5]] = add <64 x i8> [[WIDE_LOAD2]], [[VEC_PHI1]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <64 x i8> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
-; CHECK-NEXT: ret i8 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
+; CHECK-NEXT: ret i8 [[TMP7]]
;
entry:
br label %for.body
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0
; CHECK-NEXT: store <32 x i8> [[STRIDED_VEC]], ptr [[TMP4]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [3 x i8], ptr [[SRC]], i64 [[TMP5]], i64 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <24 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [3 x i8], ptr [[SRC]], i64 [[TMP6]], i64 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <24 x i8>, ptr [[TMP8]], align 1
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <24 x i8> [[WIDE_VEC2]], <24 x i8> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0
-; CHECK-NEXT: store <8 x i8> [[STRIDED_VEC3]], ptr [[TMP9]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0
+; CHECK-NEXT: store <8 x i8> [[STRIDED_VEC3]], ptr [[TMP10]], align 1
; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 24
+; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: br label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph: