#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
+#include <set>
+
using namespace llvm;
using namespace llvm::PatternMatch;
bool RecurrenceDescriptor::isFirstOrderRecurrence(
PHINode *Phi, Loop *TheLoop,
- DenseMap<Instruction *, Instruction *> &SinkAfter, DominatorTree *DT) {
+ MapVector<Instruction *, Instruction *> &SinkAfter, DominatorTree *DT) {
// Ensure the phi node is in the loop header and has two incoming values.
if (Phi->getParent() != TheLoop->getHeader() ||
SinkAfter.count(Previous)) // Cannot rely on dominance due to motion.
return false;
- // Ensure every user of the phi node is dominated by the previous value.
- // The dominance requirement ensures the loop vectorizer will not need to
- // vectorize the initial value prior to the first iteration of the loop.
- // TODO: Consider extending this sinking to handle memory instructions and
- // phis with multiple users.
-
- // Returns true, if all users of I are dominated by DominatedBy.
- auto allUsesDominatedBy = [DT](Instruction *I, Instruction *DominatedBy) {
- return all_of(I->uses(), [DT, DominatedBy](Use &U) {
- return DT->dominates(DominatedBy, U);
- });
+ // Ensure every user of the phi node (recursively) is dominated by the
+ // previous value. The dominance requirement ensures the loop vectorizer will
+ // not need to vectorize the initial value prior to the first iteration of the
+ // loop.
+ // TODO: Consider extending this sinking to handle memory instructions.
+
+ // We optimistically assume we can sink all users after Previous. Keep a set
+ // of instructions to sink after Previous ordered by dominance in the common
+ // basic block. It will be applied to SinkAfter if all users can be sunk.
+ auto CompareByComesBefore = [](const Instruction *A, const Instruction *B) {
+ return A->comesBefore(B);
};
+ std::set<Instruction *, decltype(CompareByComesBefore)> InstrsToSink(
+ CompareByComesBefore);
+
+ BasicBlock *PhiBB = Phi->getParent();
+ SmallVector<Instruction *, 8> WorkList;
+ auto TryToPushSinkCandidate = [&](Instruction *SinkCandidate) {
+ // Already sunk SinkCandidate.
+ if (SinkCandidate->getParent() == PhiBB &&
+ InstrsToSink.find(SinkCandidate) != InstrsToSink.end())
+ return true;
- if (Phi->hasOneUse()) {
- Instruction *I = Phi->user_back();
-
- // If the user of the PHI is also the incoming value, we potentially have a
- // reduction and which cannot be handled by sinking.
- if (Previous == I)
+ // Cyclic dependence.
+ if (Previous == SinkCandidate)
return false;
- // We cannot sink terminator instructions.
- if (I->getParent()->getTerminator() == I)
+ if (DT->dominates(Previous,
+ SinkCandidate)) // We already are good w/o sinking.
+ return true;
+
+ if (SinkCandidate->getParent() != PhiBB ||
+ SinkCandidate->mayHaveSideEffects() ||
+ SinkCandidate->mayReadFromMemory() || SinkCandidate->isTerminator())
return false;
// Do not try to sink an instruction multiple times (if multiple operands
// are first order recurrences).
// TODO: We can support this case, by sinking the instruction after the
// 'deepest' previous instruction.
- if (SinkAfter.find(I) != SinkAfter.end())
+ if (SinkAfter.find(SinkCandidate) != SinkAfter.end())
return false;
- if (DT->dominates(Previous, I)) // We already are good w/o sinking.
+ // If we reach a PHI node that is not dominated by Previous, we reached a
+ // header PHI. No need for sinking.
+ if (isa<PHINode>(SinkCandidate))
return true;
- // We can sink any instruction without side effects, as long as all users
- // are dominated by the instruction we are sinking after.
- if (I->getParent() == Phi->getParent() && !I->mayHaveSideEffects() &&
- allUsesDominatedBy(I, Previous)) {
- SinkAfter[I] = Previous;
- return true;
+ // Sink User tentatively and check its users
+ InstrsToSink.insert(SinkCandidate);
+ WorkList.push_back(SinkCandidate);
+ return true;
+ };
+
+ WorkList.push_back(Phi);
+ // Try to recursively sink instructions and their users after Previous.
+ while (!WorkList.empty()) {
+ Instruction *Current = WorkList.pop_back_val();
+ for (User *User : Current->users()) {
+ if (!TryToPushSinkCandidate(cast<Instruction>(User)))
+ return false;
}
}
- return allUsesDominatedBy(Phi, Previous);
+ // We can sink all users of Phi. Update the mapping.
+ for (Instruction *I : InstrsToSink) {
+ SinkAfter[I] = Previous;
+ Previous = I;
+ }
+ return true;
}
/// This function returns the identity element (or neutral element) for
ret void
}
-; FIXME: Currently we can only sink a single instruction. For the example below,
-; we also have to sink users.
-define void @cannot_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) {
-; CHECK-LABEL: @cannot_sink_with_additional_user(
+; Sink users of %pre.phi recursively.
+define void @can_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) {
+; CHECK-LABEL: @can_sink_with_additional_user(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[PREHEADER:%.*]]
; CHECK: preheader:
; CHECK-NEXT: [[IDX_PHI_TRANS:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
; CHECK-NEXT: [[DOTPRE:%.*]] = load i32, i32* [[IDX_PHI_TRANS]], align 4
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> poison, i32 [[DOTPRE]], i32 3
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP5]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP6]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1996
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1999, 1996
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[DOTPRE]], [[PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1997, [[MIDDLE_BLOCK]] ], [ 1, [[PREHEADER]] ]
; CHECK-NEXT: br label [[FOR:%.*]]
; CHECK: for:
-; CHECK-NEXT: [[PRE_PHI:%.*]] = phi i32 [ [[DOTPRE]], [[PREHEADER]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, [[PREHEADER]] ], [ [[IV_NEXT:%.*]], [[FOR]] ]
-; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[PRE_PHI]], [[X:%.*]]
+; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[PRE_NEXT:%.*]], [[FOR]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR]] ]
+; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[SCALAR_RECUR]], [[X]]
; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[X]]
; CHECK-NEXT: [[IDX_1:%.*]] = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 [[IV]]
; CHECK-NEXT: [[PRE_NEXT]] = load i32, i32* [[IDX_1]], align 4
; CHECK-NEXT: store i32 [[ADD_4]], i32* [[IDX_2]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 2000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[FOR]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[FOR]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
-
entry:
br label %preheader
ret void
}
-; Users that are phi nodes cannot be sunk.
+; The (first) reason `%first_time.1` cannot be sunk is because it appears outside
+; the header and is not dominated by Previous. The fact that it feeds Previous
+; is a second sinking-preventing reason.
define void @cannot_sink_phi(i32* %ptr) {
; CHECK-LABEL: @cannot_sink_phi(
; CHECK-NEXT: entry:
entry:
br label %loop.header
-loop.header: ; preds = %if.end128, %for.cond108.preheader
+loop.header:
%iv = phi i64 [ 1, %entry ], [ %iv.next, %loop.latch ]
%for = phi i32 [ 0, %entry ], [ %for.next, %loop.latch ]
%c.1 = icmp ult i64 %iv, 500
br i1 %c.1, label %if.truebb, label %if.falsebb
-if.truebb: ; preds = %for.body114
+if.truebb:
br label %loop.latch
-if.falsebb: ; preds = %for.body114
+if.falsebb:
br label %loop.latch
-loop.latch: ; preds = %if.then122, %for.body114.if.end128_crit_edge
+loop.latch:
%first_time.1 = phi i32 [ 20, %if.truebb ], [ %for, %if.falsebb ]
%c.2 = icmp ult i64 %iv, 800
%for.next = select i1 %c.2, i32 30, i32 %first_time.1
define void @sink_dominance(i32* %ptr, i32 %N) {
; CHECK-LABEL: @sink_dominance(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[UMAX1:%.*]] = call i32 @llvm.umax.i32(i32 [[N:%.*]], i32 1)
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[UMAX1]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[UMAX]], -1
+; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
+; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP1]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 false, i1 [[TMP3]], i1 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[UMAX1]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[UMAX1]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT: [[TMP12]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP12]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = icmp slt <4 x i32> [[TMP14]], <i32 213, i32 213, i32 213, i32 213>
+; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> [[TMP14]], <4 x i32> <i32 22, i32 22, i32 22, i32 22>
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP17]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP12]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP12]], i32 2
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[FOR_TRUNC:%.*]] = trunc i64 [[FOR]] to i32
+; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[FOR_TRUNC:%.*]] = trunc i64 [[SCALAR_RECUR]] to i32
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[FOR_TRUNC]], 213
; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[FOR_TRUNC]], i32 22
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[IV]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[IV]]
; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP]], align 4
; CHECK-NEXT: [[FOR_NEXT]] = zext i32 [[LV]] to i64
; CHECK-NEXT: store i32 [[SELECT]], i32* [[GEP]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[CMP73:%.*]] = icmp ugt i32 [[N:%.*]], [[IV_NEXT]]
-; CHECK-NEXT: br i1 [[CMP73]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT: [[CMP73:%.*]] = icmp ugt i32 [[N]], [[IV_NEXT]]
+; CHECK-NEXT: br i1 [[CMP73]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
define void @sink_dominance_2(i32* %ptr, i32 %N) {
; CHECK-LABEL: @sink_dominance_2(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[UMAX1:%.*]] = call i32 @llvm.umax.i32(i32 [[N:%.*]], i32 1)
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[UMAX1]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK: vector.scevcheck:
+; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[UMAX]], -1
+; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
+; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp sgt i32 [[TMP2]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP1]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = select i1 false, i1 [[TMP3]], i1 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[TMP7:%.*]] = or i1 false, [[TMP6]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[UMAX1]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[UMAX1]], [[N_MOD_VF]]
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT: [[TMP12]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP12]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i32> [[TMP15]], <i32 99, i32 99, i32 99, i32 99>
+; CHECK-NEXT: [[TMP17:%.*]] = icmp slt <4 x i32> [[TMP14]], <i32 213, i32 213, i32 213, i32 213>
+; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[TMP17]], <4 x i32> [[TMP14]], <4 x i32> [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* [[TMP19]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP12]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP12]], i32 2
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[FOR:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[FOR_TRUNC:%.*]] = trunc i64 [[FOR]] to i32
+; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[FOR_TRUNC:%.*]] = trunc i64 [[SCALAR_RECUR]] to i32
; CHECK-NEXT: [[STEP:%.*]] = add i32 [[FOR_TRUNC]], 2
; CHECK-NEXT: [[STEP_2:%.*]] = mul i32 [[STEP]], 99
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[FOR_TRUNC]], 213
; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[CMP]], i32 [[FOR_TRUNC]], i32 [[STEP_2]]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[IV]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[IV]]
; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP]], align 4
; CHECK-NEXT: [[FOR_NEXT]] = zext i32 [[LV]] to i64
; CHECK-NEXT: store i32 [[SELECT]], i32* [[GEP]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[CMP73:%.*]] = icmp ugt i32 [[N:%.*]], [[IV_NEXT]]
-; CHECK-NEXT: br i1 [[CMP73]], label [[LOOP]], label [[EXIT:%.*]]
+; CHECK-NEXT: [[CMP73:%.*]] = icmp ugt i32 [[N]], [[IV_NEXT]]
+; CHECK-NEXT: br i1 [[CMP73]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
; b[i] = ((a[i] + 2) * a[i + 1]);
; }
;
-; NO-SINK-AFTER-LABEL: no_sink_after
-; NO-SINK-AFTER-NOT: vector.ph:
-; NO-SINK-AFTER: }
+; SINK-AFTER-LABEL: @sink_after_with_multiple_users
+; SINK-AFTER-LABEL: vector.ph:
+; SINK-AFTER-NEXT: %n.mod.vf = urem i64 %n, 4
+; SINK-AFTER-NEXT: %n.vec = sub i64 %n, %n.mod.vf
+; SINK-AFTER-NEXT: %vector.recur.init = insertelement <4 x i16> poison, i16 %.pre, i32 3
+; SINK-AFTER-NEXT: br label %vector.body
+
+; SINK-AFTER-LABEL: vector.body:
+; SINK-AFTER-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; SINK-AFTER-NEXT: %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
+; SINK-AFTER-NEXT: %1 = add i64 %index, 0
+; SINK-AFTER-NEXT: %2 = add nuw nsw i64 %1, 1
+; SINK-AFTER-NEXT: %3 = getelementptr inbounds i16, i16* %a, i64 %2
+; SINK-AFTER-NEXT: %4 = getelementptr inbounds i16, i16* %3, i32 0
+; SINK-AFTER-NEXT: %5 = bitcast i16* %4 to <4 x i16>*
+; SINK-AFTER-NEXT: %wide.load = load <4 x i16>, <4 x i16>* %5, align 2, !alias.scope !43
+; SINK-AFTER-NEXT: %6 = shufflevector <4 x i16> %vector.recur, <4 x i16> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT: %7 = sext <4 x i16> %6 to <4 x i32>
+; SINK-AFTER-NEXT: %8 = add nsw <4 x i32> %7, <i32 2, i32 2, i32 2, i32 2>
+; SINK-AFTER-NEXT: %9 = sext <4 x i16> %wide.load to <4 x i32>
+; SINK-AFTER-NEXT: %10 = mul nsw <4 x i32> %8, %9
+; SINK-AFTER-NEXT: %11 = getelementptr inbounds i32, i32* %b, i64 %1
+; SINK-AFTER-NEXT: %12 = getelementptr inbounds i32, i32* %11, i32 0
+; SINK-AFTER-NEXT: %13 = bitcast i32* %12 to <4 x i32>*
+; SINK-AFTER-NEXT: store <4 x i32> %10, <4 x i32>* %13, align 4, !alias.scope !46, !noalias !43
+; SINK-AFTER-NEXT: %index.next = add i64 %index, 4
+; SINK-AFTER-NEXT: %14 = icmp eq i64 %index.next, %n.vec
+; SINK-AFTER-NEXT: br i1 %14, label %middle.block, label %vector.body, !llvm.loop !48
;
-define void @no_sink_after(i16* %a, i32* %b, i64 %n) {
+define void @sink_after_with_multiple_users(i16* %a, i32* %b, i64 %n) {
entry:
%.pre = load i16, i16* %a
br label %for.body
; SINK-AFTER-NEXT: %index.next = add i32 %index, 4
; SINK-AFTER-NEXT: %vec.ind.next = add <4 x i16> %vec.ind, <i16 4, i16 4, i16 4, i16 4>
; SINK-AFTER-NEXT: %6 = icmp eq i32 %index.next, 40
-; SINK-AFTER-NEXT: br i1 %6, label %middle.block, label %vector.body, !llvm.loop !43
+; SINK-AFTER-NEXT: br i1 %6, label %middle.block, label %vector.body, !llvm.loop !50
;
entry:
br label %for.cond
; CHECK-NEXT: [[TMP23]] = add <4 x i32> [[VEC_PHI1]], [[TMP22]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP46:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !52, [[LOOP46:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP25:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI1]]
; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]])
; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ undef, [[BB2]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[TMP]]
; CHECK: bb2:
-; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2]], !prof !47, [[LOOP48:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2]], !prof !54, [[LOOP48:!llvm.loop !.*]]
;
bb:
br label %bb2
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !45, [[LOOP49:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof !52, [[LOOP49:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP23]], <4 x i32> [[VEC_PHI4]]
; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP40]])
; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ undef, [[BB2]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[TMP]]
; CHECK: bb2:
-; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2]], !prof !47, [[LOOP50:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 undef, label [[BB1]], label [[BB2]], !prof !54, [[LOOP50:!llvm.loop !.*]]
;
bb:
br label %bb2