EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"),
cl::init(true));
+static cl::opt<unsigned> MaxSinkNumUsers(
+ "instcombine-max-sink-users", cl::init(32),
+ cl::desc("Maximum number of undroppable users for instruction sinking"));
+
static cl::opt<unsigned> LimitMaxIterations(
"instcombine-max-iterations",
cl::desc("Limit the maximum number of instruction combining iterations"),
/// block.
static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock,
TargetLibraryInfo &TLI) {
- assert(I->getUniqueUndroppableUser() && "Invariants didn't hold!");
BasicBlock *SrcBlock = I->getParent();
// Cannot move control-flow-involving, volatile loads, vaarg, etc.
[this](Instruction *I) -> Optional<BasicBlock *> {
if (!EnableCodeSinking)
return None;
- auto *UserInst = cast_or_null<Instruction>(I->getUniqueUndroppableUser());
- if (!UserInst)
- return None;
BasicBlock *BB = I->getParent();
BasicBlock *UserParent = nullptr;
+ unsigned NumUsers = 0;
- // Special handling for Phi nodes - get the block the use occurs in.
- if (PHINode *PN = dyn_cast<PHINode>(UserInst)) {
- for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
- if (PN->getIncomingValue(i) == I) {
- // Bail out if we have uses in different blocks. We don't do any
- // sophisticated analysis (i.e finding NearestCommonDominator of these
- // use blocks).
- if (UserParent && UserParent != PN->getIncomingBlock(i))
- return None;
- UserParent = PN->getIncomingBlock(i);
+ for (auto *U : I->users()) {
+ if (U->isDroppable())
+ continue;
+ if (NumUsers > MaxSinkNumUsers)
+ return None;
+
+ Instruction *UserInst = cast<Instruction>(U);
+ // Special handling for Phi nodes - get the block the use occurs in.
+ if (PHINode *PN = dyn_cast<PHINode>(UserInst)) {
+ for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
+ if (PN->getIncomingValue(i) == I) {
+ // Bail out if we have uses in different blocks. We don't do any
+ // sophisticated analysis (i.e finding NearestCommonDominator of
+ // these use blocks).
+ if (UserParent && UserParent != PN->getIncomingBlock(i))
+ return None;
+ UserParent = PN->getIncomingBlock(i);
+ }
}
+ assert(UserParent && "expected to find user block!");
+ } else {
+ if (UserParent && UserParent != UserInst->getParent())
+ return None;
+ UserParent = UserInst->getParent();
}
- assert(UserParent && "expected to find user block!");
- } else
- UserParent = UserInst->getParent();
- // Try sinking to another block. If that block is unreachable, then do
- // not bother. SimplifyCFG should handle it.
- if (UserParent == BB || !DT.isReachableFromEntry(UserParent))
- return None;
+ // Make sure these checks are done only once, naturally we do the checks
+ // the first time we get the userparent, this will save compile time.
+ if (NumUsers == 0) {
+ // Try sinking to another block. If that block is unreachable, then do
+ // not bother. SimplifyCFG should handle it.
+ if (UserParent == BB || !DT.isReachableFromEntry(UserParent))
+ return None;
+
+ auto *Term = UserParent->getTerminator();
+ // See if the user is one of our successors that has only one
+ // predecessor, so that we don't have to split the critical edge.
+ // Another option where we can sink is a block that ends with a
+ // terminator that does not pass control to other block (such as
+ // return or unreachable or resume). In this case:
+ // - I dominates the User (by SSA form);
+ // - the User will be executed at most once.
+ // So sinking I down to User is always profitable or neutral.
+ if (UserParent->getUniquePredecessor() != BB && !succ_empty(Term))
+ return None;
+
+ assert(DT.dominates(BB, UserParent) && "Dominance relation broken?");
+ }
- auto *Term = UserParent->getTerminator();
- // See if the user is one of our successors that has only one
- // predecessor, so that we don't have to split the critical edge.
- // Another option where we can sink is a block that ends with a
- // terminator that does not pass control to other block (such as
- // return or unreachable or resume). In this case:
- // - I dominates the User (by SSA form);
- // - the User will be executed at most once.
- // So sinking I down to User is always profitable or neutral.
- if (UserParent->getUniquePredecessor() == BB || succ_empty(Term)) {
- assert(DT.dominates(BB, UserParent) && "Dominance relation broken?");
- return UserParent;
+ NumUsers++;
}
- return None;
+
+ // No user or only has droppable users.
+ if (!UserParent)
+ return None;
+
+ return UserParent;
};
auto OptBB = getOptionalSinkBlockForInst(I);
define void @matching_phi(i64 %a, float* %b, i1 %cond) {
; CHECK-LABEL: @matching_phi(
; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB2:%.*]], label [[BB1:%.*]]
+; CHECK: bb1:
+; CHECK-NEXT: [[ADDB:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 2
+; CHECK-NEXT: br label [[BB3:%.*]]
+; CHECK: bb2:
; CHECK-NEXT: [[ADD_INT:%.*]] = add i64 [[A:%.*]], 1
; CHECK-NEXT: [[ADD:%.*]] = inttoptr i64 [[ADD_INT]] to float*
-; CHECK-NEXT: br i1 [[COND:%.*]], label [[BBB:%.*]], label [[A:%.*]]
-; CHECK: A:
-; CHECK-NEXT: [[ADDB:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 2
-; CHECK-NEXT: br label [[C:%.*]]
-; CHECK: Bbb:
; CHECK-NEXT: store float 1.000000e+01, float* [[ADD]], align 4
-; CHECK-NEXT: br label [[C]]
-; CHECK: C:
-; CHECK-NEXT: [[A_ADDR_03:%.*]] = phi float* [ [[ADDB]], [[A]] ], [ [[ADD]], [[BBB]] ]
+; CHECK-NEXT: br label [[BB3]]
+; CHECK: bb3:
+; CHECK-NEXT: [[A_ADDR_03:%.*]] = phi float* [ [[ADDB]], [[BB1]] ], [ [[ADD]], [[BB2]] ]
; CHECK-NEXT: [[I1:%.*]] = load float, float* [[A_ADDR_03]], align 4
; CHECK-NEXT: [[MUL_I:%.*]] = fmul float [[I1]], 4.200000e+01
; CHECK-NEXT: store float [[MUL_I]], float* [[A_ADDR_03]], align 4
%addb = getelementptr inbounds float, float* %b, i64 2
%addb.int = ptrtoint float* %addb to i64
- br i1 %cmp1, label %A, label %Bbb
-A:
- br label %C
-Bbb:
+ br i1 %cmp1, label %bb1, label %bb2
+bb1:
+ br label %bb3
+bb2:
store float 1.0e+01, float* %add, align 4
- br label %C
+ br label %bb3
-C:
- %a.addr.03 = phi float* [ %addb, %A ], [ %add, %Bbb ]
- %b.addr.02 = phi i64 [ %addb.int, %A ], [ %add.int, %Bbb ]
+bb3:
+ %a.addr.03 = phi float* [ %addb, %bb1 ], [ %add, %bb2 ]
+ %b.addr.02 = phi i64 [ %addb.int, %bb1 ], [ %add.int, %bb2 ]
%i0 = inttoptr i64 %b.addr.02 to float*
%i1 = load float, float* %i0, align 4
%mul.i = fmul float %i1, 4.200000e+01
; CHECK-LABEL: @no_matching_phi(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ADD_INT:%.*]] = add i64 [[A:%.*]], 1
-; CHECK-NEXT: [[ADD:%.*]] = inttoptr i64 [[ADD_INT]] to float*
; CHECK-NEXT: [[ADDB:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 2
; CHECK-NEXT: br i1 [[COND:%.*]], label [[B:%.*]], label [[A:%.*]]
; CHECK: A:
; CHECK-NEXT: br label [[C:%.*]]
; CHECK: B:
+; CHECK-NEXT: [[ADDB_INT:%.*]] = ptrtoint float* [[ADDB]] to i64
+; CHECK-NEXT: [[ADD:%.*]] = inttoptr i64 [[ADD_INT]] to float*
; CHECK-NEXT: store float 1.000000e+01, float* [[ADD]], align 4
; CHECK-NEXT: br label [[C]]
; CHECK: C:
; CHECK-NEXT: [[A_ADDR_03:%.*]] = phi float* [ [[ADDB]], [[A]] ], [ [[ADD]], [[B]] ]
-; CHECK-NEXT: [[B_ADDR_02_PTR:%.*]] = phi float* [ [[ADD]], [[A]] ], [ [[ADDB]], [[B]] ]
-; CHECK-NEXT: [[I1:%.*]] = load float, float* [[B_ADDR_02_PTR]], align 4
+; CHECK-NEXT: [[B_ADDR_02:%.*]] = phi i64 [ [[ADD_INT]], [[A]] ], [ [[ADDB_INT]], [[B]] ]
+; CHECK-NEXT: [[I0:%.*]] = inttoptr i64 [[B_ADDR_02]] to float*
+; CHECK-NEXT: [[I1:%.*]] = load float, float* [[I0]], align 4
; CHECK-NEXT: [[MUL_I:%.*]] = fmul float [[I1]], 4.200000e+01
; CHECK-NEXT: store float [[MUL_I]], float* [[A_ADDR_03]], align 4
; CHECK-NEXT: ret void
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TEXT:%.*]] = alloca [1 x i8], align 1
; CHECK-NEXT: [[BUFF:%.*]] = alloca [1 x i8], align 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[TEXT]], i64 0, i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[BUFF]], i64 0, i64 0
; CHECK-NEXT: br i1 [[FLAG:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
; CHECK: if:
; CHECK-NEXT: br label [[BB2:%.*]]
; CHECK: bb2:
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: bb3:
-; CHECK-NEXT: call void @llvm.dbg.declare(metadata [1 x i8]* [[TEXT]], [[META16:metadata !.*]], metadata !DIExpression()), [[DBG24:!dbg !.*]]
+; CHECK-NEXT: call void @llvm.dbg.declare(metadata [1 x i8]* [[TEXT]], metadata [[META16:![0-9]+]], metadata !DIExpression()), !dbg [[DBG24:![0-9]+]]
; CHECK-NEXT: br label [[FIN:%.*]]
; CHECK: else:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[TEXT]], i64 0, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[BUFF]], i64 0, i64 0
; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* [[TMP0]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* [[TMP1]])
; CHECK-NEXT: call void @foo(i8* [[TMP1]], i8* [[TMP0]])
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TEXT:%.*]] = alloca [1 x i8], align 1
; CHECK-NEXT: [[BUFF:%.*]] = alloca [1 x i8], align 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[TEXT]], i64 0, i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[BUFF]], i64 0, i64 0
; CHECK-NEXT: br i1 [[FLAG:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
; CHECK: if:
; CHECK-NEXT: br label [[BB2:%.*]]
; CHECK: bb2:
; CHECK-NEXT: br label [[BB3:%.*]]
; CHECK: bb3:
-; CHECK-NEXT: call void @llvm.dbg.declare(metadata [1 x i8]* [[TEXT]], [[META16:metadata !.*]], metadata !DIExpression()), [[DBG24:!dbg !.*]]
+; CHECK-NEXT: call void @llvm.dbg.declare(metadata [1 x i8]* [[TEXT]], metadata [[META16:![0-9]+]], metadata !DIExpression()), !dbg [[DBG24:![0-9]+]]
; CHECK-NEXT: br label [[FIN:%.*]]
; CHECK: else:
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[TEXT]], i64 0, i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x i8], [1 x i8]* [[BUFF]], i64 0, i64 0
; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* nonnull [[TMP0]])
; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* nonnull [[TMP1]])
; CHECK-NEXT: call void @foo(i8* nonnull [[TMP1]], i8* nonnull [[TMP0]])
; CHECK-NEXT: [[I:%.*]] = load i8, i8* @var_7, align 1
; CHECK-NEXT: [[I1:%.*]] = icmp eq i8 [[I]], -1
; CHECK-NEXT: [[I4:%.*]] = load i16, i16* @var_0, align 2
-; CHECK-NEXT: [[I8:%.*]] = sext i16 [[I4]] to i32
; CHECK-NEXT: br i1 [[I1]], label [[BB10:%.*]], label [[BB9:%.*]]
; CHECK: bb9:
; CHECK-NEXT: br label [[BB12:%.*]]
; CHECK-NEXT: [[STOREMERGE1:%.*]] = phi i32 [ [[I11]], [[BB10]] ], [ 1, [[BB9]] ]
; CHECK-NEXT: store i32 [[STOREMERGE1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @arr_2, i64 0, i64 0), align 4
; CHECK-NEXT: store i16 [[I4]], i16* getelementptr inbounds ([0 x i16], [0 x i16]* @arr_4, i64 0, i64 0), align 2
+; CHECK-NEXT: [[I8:%.*]] = sext i16 [[I4]] to i32
; CHECK-NEXT: store i32 [[I8]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @arr_3, i64 0, i64 0), align 16
; CHECK-NEXT: store i32 [[STOREMERGE1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @arr_2, i64 0, i64 1), align 4
; CHECK-NEXT: store i16 [[I4]], i16* getelementptr inbounds ([0 x i16], [0 x i16]* @arr_4, i64 0, i64 1), align 2
; CHECK-LABEL: @f(
; CHECK-NEXT: bb0:
; CHECK-NEXT: [[T12:%.*]] = alloca [2 x i32], align 8
-; CHECK-NEXT: [[T12_SUB:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[T12]], i16 0, i16 0
; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
; CHECK: bb1:
; CHECK-NEXT: unreachable
; CHECK: bb2:
+; CHECK-NEXT: [[T12_SUB:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[T12]], i16 0, i16 0
; CHECK-NEXT: [[T9:%.*]] = load i16*, i16** @b, align 2
; CHECK-NEXT: store i16 0, i16* [[T9]], align 2
; CHECK-NEXT: [[T10:%.*]] = load i32, i32* [[T12_SUB]], align 8
define i32 @t6_twoshifts(i32 %x, i8 %shamt) {
; CHECK-LABEL: @t6_twoshifts(
; CHECK-NEXT: bb:
-; CHECK-NEXT: [[SHAMT_WIDE:%.*]] = sext i8 [[SHAMT:%.*]] to i32
; CHECK-NEXT: br label [[WORK:%.*]]
; CHECK: work:
; CHECK-NEXT: br label [[END:%.*]]
; CHECK: end:
+; CHECK-NEXT: [[SHAMT_WIDE:%.*]] = sext i8 [[SHAMT:%.*]] to i32
; CHECK-NEXT: [[N0:%.*]] = shl i32 [[X:%.*]], [[SHAMT_WIDE]]
; CHECK-NEXT: [[R:%.*]] = ashr i32 [[N0]], [[SHAMT_WIDE]]
; CHECK-NEXT: ret i32 [[R]]
}
define i32 @n12_twoshifts_and_extrause(i32 %x, i8 %shamt) {
; CHECK-LABEL: @n12_twoshifts_and_extrause(
-; CHECK-NEXT: [[SHAMT_WIDE:%.*]] = sext i8 [[SHAMT:%.*]] to i32
; CHECK-NEXT: br label [[WORK:%.*]]
; CHECK: work:
; CHECK-NEXT: br label [[END:%.*]]
; CHECK: end:
+; CHECK-NEXT: [[SHAMT_WIDE:%.*]] = sext i8 [[SHAMT:%.*]] to i32
; CHECK-NEXT: [[N0:%.*]] = shl i32 [[X:%.*]], [[SHAMT_WIDE]]
; CHECK-NEXT: [[R:%.*]] = ashr i32 [[N0]], [[SHAMT_WIDE]]
; CHECK-NEXT: call void @use32(i32 [[SHAMT_WIDE]])
declare void @abort()
declare { i64, i1 } @llvm.umul.with.overflow.i64(i64, i64)
declare void @dummy(i64)
-; Todo: Two uses in two different users of a single successor block. We can sink.
+; Two uses in two different users of a single successor block. We can sink.
define i64 @test8(i64 %c) {
; CHECK-LABEL: @test8(
; CHECK-NEXT: bb1:
; CHECK-NEXT: [[OVERFLOW:%.*]] = icmp ugt i64 [[C:%.*]], 2305843009213693951
-; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[OVERFLOW]], i64 0, i64 8
; CHECK-NEXT: br i1 [[OVERFLOW]], label [[ABORT:%.*]], label [[BB2:%.*]]
; CHECK: bb2:
-; CHECK-NEXT: call void @dummy(i64 [[SELECT]])
-; CHECK-NEXT: ret i64 [[SELECT]]
+; CHECK-NEXT: call void @dummy(i64 8)
+; CHECK-NEXT: ret i64 8
; CHECK: abort:
; CHECK-NEXT: call void @abort()
; CHECK-NEXT: unreachable
; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX2_EPIL]], align 4
; CHECK-NEXT: [[MUL_EPIL:%.*]] = mul nsw i32 [[TMP3]], [[TMP2]]
; CHECK-NEXT: [[ADD_EPIL:%.*]] = add nsw i32 [[MUL_EPIL]], [[C_010_UNR]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[INDVARS_IV_UNR]], 1
; CHECK-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 1
; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA:%.*]], label [[FOR_BODY_EPIL_1:%.*]]
; CHECK: for.body.epil.1:
+; CHECK-NEXT: [[INDVARS_IV_NEXT_EPIL:%.*]] = add nuw nsw i64 [[INDVARS_IV_UNR]], 1
; CHECK-NEXT: [[ARRAYIDX_EPIL_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_EPIL]]
; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_EPIL_1]], align 4
; CHECK-NEXT: [[ARRAYIDX2_EPIL_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_EPIL]]
; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX2_EPIL_1]], align 4
; CHECK-NEXT: [[MUL_EPIL_1:%.*]] = mul nsw i32 [[TMP5]], [[TMP4]]
; CHECK-NEXT: [[ADD_EPIL_1:%.*]] = add nsw i32 [[MUL_EPIL_1]], [[ADD_EPIL]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[INDVARS_IV_UNR]], 2
; CHECK-NEXT: [[EPIL_ITER_CMP_1_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 2
; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_EPILOG_LCSSA]], label [[FOR_BODY_EPIL_2:%.*]]
; CHECK: for.body.epil.2:
+; CHECK-NEXT: [[INDVARS_IV_NEXT_EPIL_1:%.*]] = add nuw nsw i64 [[INDVARS_IV_UNR]], 2
; CHECK-NEXT: [[ARRAYIDX_EPIL_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_EPIL_1]]
; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_EPIL_2]], align 4
; CHECK-NEXT: [[ARRAYIDX2_EPIL_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_EPIL_1]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT]], zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
-; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT9]], <16 x i32>* [[TMP2]], align 4, !alias.scope !0, !noalias !3
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
+; CHECK-NEXT: store <16 x i32> [[BROADCAST_SPLAT9]], <16 x i32>* [[TMP1]], align 4, !alias.scope !0, !noalias !3
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP1]], <16 x i32> undef), !alias.scope !3
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP1]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i32*> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[BROADCAST_SPLAT]], i32 4, <16 x i1> [[TMP3]], <16 x i32> undef), !alias.scope !3
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC]]
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[PREDPHI]], i64 15
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: [[SMAX11:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1)
; CHECK-NEXT: [[N_VEC13:%.*]] = and i64 [[SMAX11]], 9223372036854775800
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT18]], <8 x i32*> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT20]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT17]], <8 x i32*> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT19]], <8 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX14:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX14]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT19]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT21]], <8 x i32>* [[TMP7]], align 4
-; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX14]], 8
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC13]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
+; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT20]], <8 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: [[INDEX_NEXT23]] = add nuw i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC13]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[WIDE_MASKED_GATHER22:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT19]], i32 4, <8 x i1> [[TMP6]], <8 x i32> undef)
-; CHECK-NEXT: [[PREDPHI23:%.*]] = select <8 x i1> [[TMP6]], <8 x i32> [[WIDE_MASKED_GATHER22]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
-; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[SMAX11]], [[N_VEC13]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI23]], i64 7
-; CHECK-NEXT: br i1 [[CMP_N16]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT18]], zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER21:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT18]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef)
+; CHECK-NEXT: [[PREDPHI22:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER21]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
+; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[SMAX11]], [[N_VEC13]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI22]], i64 7
+; CHECK-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[TMP1]], label [[DOTLR_PH5_PREHEADER:%.*]], label [[DOTPREHEADER:%.*]]
; CHECK: .lr.ph5.preheader:
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i64 0
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[N]], 0
; CHECK-NEXT: br i1 [[TMP17]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH_PREHEADER:%.*]]
; CHECK: .lr.ph.preheader:
-; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
; CHECK-NEXT: br i1 false, label [[SCALAR_PH8:%.*]], label [[VECTOR_PH10:%.*]]
; CHECK: vector.ph10:
+; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP19:%.*]] = zext i32 [[TMP18]] to i64
; CHECK-NEXT: [[N_RND_UP11:%.*]] = add nuw nsw i64 [[TMP19]], 4
; CHECK-NEXT: [[N_VEC13:%.*]] = and i64 [[N_RND_UP11]], 8589934588
; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <4 x i64> poison, i64 [[TMP19]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT18]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY9:%.*]]
; CHECK: vector.body9:
-; CHECK-NEXT: [[INDEX38:%.*]] = phi i64 [ 0, [[VECTOR_PH10]] ], [ [[INDEX_NEXT37:%.*]], [[PRED_STORE_CONTINUE36:%.*]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX38]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX38]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT27]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT28]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ 0, [[VECTOR_PH10]] ], [ [[INDEX_NEXT31:%.*]], [[PRED_STORE_CONTINUE30:%.*]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[I_0_LCSSA]], [[INDEX20]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX20]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT22:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT21]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT22]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: [[TMP20:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT19]]
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP20]], i64 0
-; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30:%.*]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_STORE_IF23:%.*]], label [[PRED_STORE_CONTINUE24:%.*]]
; CHECK: pred.store.if23:
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4
; CHECK-NEXT: [[TMP26:%.*]] = and i32 [[TMP25]], [[TMP23]]
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[OFFSET_IDX]]
; CHECK-NEXT: store i32 [[TMP26]], i32* [[TMP27]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]]
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE24]]
; CHECK: pred.store.continue24:
; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP20]], i64 1
-; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF31:%.*]], label [[PRED_STORE_CONTINUE32:%.*]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF25:%.*]], label [[PRED_STORE_CONTINUE26:%.*]]
; CHECK: pred.store.if25:
; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[OFFSET_IDX]], 1
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP29]]
; CHECK-NEXT: [[TMP34:%.*]] = and i32 [[TMP33]], [[TMP31]]
; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP29]]
; CHECK-NEXT: store i32 [[TMP34]], i32* [[TMP35]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE32]]
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE26]]
; CHECK: pred.store.continue26:
; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP20]], i64 2
-; CHECK-NEXT: br i1 [[TMP36]], label [[PRED_STORE_IF33:%.*]], label [[PRED_STORE_CONTINUE34:%.*]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[PRED_STORE_IF27:%.*]], label [[PRED_STORE_CONTINUE28:%.*]]
; CHECK: pred.store.if27:
; CHECK-NEXT: [[TMP37:%.*]] = add i64 [[OFFSET_IDX]], 2
; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP37]]
; CHECK-NEXT: [[TMP42:%.*]] = and i32 [[TMP41]], [[TMP39]]
; CHECK-NEXT: [[TMP43:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP37]]
; CHECK-NEXT: store i32 [[TMP42]], i32* [[TMP43]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE34]]
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE28]]
; CHECK: pred.store.continue28:
; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x i1> [[TMP20]], i64 3
-; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_STORE_IF35:%.*]], label [[PRED_STORE_CONTINUE36]]
+; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_STORE_IF29:%.*]], label [[PRED_STORE_CONTINUE30]]
; CHECK: pred.store.if29:
; CHECK-NEXT: [[TMP45:%.*]] = add i64 [[OFFSET_IDX]], 3
; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 [[TMP45]]
; CHECK-NEXT: [[TMP50:%.*]] = and i32 [[TMP49]], [[TMP47]]
; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 [[TMP45]]
; CHECK-NEXT: store i32 [[TMP50]], i32* [[TMP51]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE36]]
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE30]]
; CHECK: pred.store.continue30:
-; CHECK-NEXT: [[INDEX_NEXT37]] = add i64 [[INDEX38]], 4
-; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT37]], [[N_VEC13]]
+; CHECK-NEXT: [[INDEX_NEXT31]] = add i64 [[INDEX20]], 4
+; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC13]]
; CHECK-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK7:%.*]], label [[VECTOR_BODY9]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block7:
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH8]]
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[N:%.*]], 0
; CHECK-NEXT: br i1 [[TMP1]], label [[DOT_CRIT_EDGE:%.*]], label [[DOTLR_PH_PREHEADER:%.*]]
; CHECK: .lr.ph.preheader:
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
; CHECK-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[TMP3]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TMP3]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE21:%.*]] ]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT14]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT15]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE19:%.*]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT12]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT13]], <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i64 0
; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[P:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[Q:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[NEXT_GEP10]], align 16
+; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[Q:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[NEXT_GEP8]], align 16
; CHECK-NEXT: store i32 [[TMP6]], i32* [[NEXT_GEP]], align 16
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
; CHECK: pred.store.continue:
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP4]], i64 1
-; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
; CHECK: pred.store.if14:
; CHECK-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP8]]
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[NEXT_GEP11]], align 16
-; CHECK-NEXT: store i32 [[TMP10]], i32* [[NEXT_GEP7]], align 16
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]]
+; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[NEXT_GEP9]], align 16
+; CHECK-NEXT: store i32 [[TMP10]], i32* [[NEXT_GEP5]], align 16
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE15]]
; CHECK: pred.store.continue15:
; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP4]], i64 2
-; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17:%.*]]
; CHECK: pred.store.if16:
; CHECK-NEXT: [[TMP12:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP12]]
+; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP12:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[NEXT_GEP12]], align 16
-; CHECK-NEXT: store i32 [[TMP14]], i32* [[NEXT_GEP8]], align 16
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]]
+; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[NEXT_GEP10]], align 16
+; CHECK-NEXT: store i32 [[TMP14]], i32* [[NEXT_GEP6]], align 16
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE17]]
; CHECK: pred.store.continue17:
; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP4]], i64 3
-; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19]]
; CHECK: pred.store.if18:
; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP16]]
+; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[P]], i64 [[TMP16]]
; CHECK-NEXT: [[TMP17:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[NEXT_GEP13]], align 16
-; CHECK-NEXT: store i32 [[TMP18]], i32* [[NEXT_GEP9]], align 16
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE21]]
+; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i32, i32* [[Q]], i64 [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[NEXT_GEP11]], align 16
+; CHECK-NEXT: store i32 [[TMP18]], i32* [[NEXT_GEP7]], align 16
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE19]]
; CHECK: pred.store.continue19:
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE16:%.*]] ]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = or <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i64> [[INDUCTION]], <i64 257, i64 257, i64 257, i64 257>
+; CHECK-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i64> [[VEC_IV]], <i64 257, i64 257, i64 257, i64 257>
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP1]], i64 0
; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
; DISABLED_MASKED_STRIDED-LABEL: @test2(
; DISABLED_MASKED_STRIDED-NEXT: entry:
; DISABLED_MASKED_STRIDED-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[NUMPOINTS:%.*]], 0
-; DISABLED_MASKED_STRIDED-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; DISABLED_MASKED_STRIDED: for.body.preheader:
+; DISABLED_MASKED_STRIDED-NEXT: br i1 [[CMP15]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; DISABLED_MASKED_STRIDED: vector.ph:
; DISABLED_MASKED_STRIDED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[NUMPOINTS]] to i64
; DISABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[WIDE_TRIP_COUNT]], 3
; DISABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588
; DISABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer
; DISABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; DISABLED_MASKED_STRIDED: vector.body:
-; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE15:%.*]] ]
-; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[FOR_BODY_PREHEADER]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE15]] ]
+; DISABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE15:%.*]] ]
+; DISABLED_MASKED_STRIDED-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE15]] ]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]]
; DISABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = bitcast i16* [[TMP1]] to <4 x i16>*
; ENABLED_MASKED_STRIDED-LABEL: @test2(
; ENABLED_MASKED_STRIDED-NEXT: entry:
; ENABLED_MASKED_STRIDED-NEXT: [[CMP15:%.*]] = icmp sgt i32 [[NUMPOINTS:%.*]], 0
-; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP15]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; ENABLED_MASKED_STRIDED: for.body.preheader:
+; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP15]], label [[VECTOR_PH:%.*]], label [[FOR_END:%.*]]
+; ENABLED_MASKED_STRIDED: vector.ph:
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[NUMPOINTS]] to i64
; ENABLED_MASKED_STRIDED-NEXT: [[N_RND_UP:%.*]] = add nuw nsw i64 [[WIDE_TRIP_COUNT]], 3
; ENABLED_MASKED_STRIDED-NEXT: [[N_VEC:%.*]] = and i64 [[N_RND_UP]], 8589934588
; ENABLED_MASKED_STRIDED-NEXT: [[TMP0:%.*]] = getelementptr inbounds i16, i16* [[POINTS:%.*]], i64 -1
; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]]
; ENABLED_MASKED_STRIDED: vector.body:
-; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0
; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; ENABLED_MASKED_STRIDED-NEXT: [[INDUCTION:%.*]] = or <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
-; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
+; ENABLED_MASKED_STRIDED-NEXT: [[VEC_IV:%.*]] = or <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
+; ENABLED_MASKED_STRIDED-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i64 [[INDEX]]
; ENABLED_MASKED_STRIDED-NEXT: [[TMP3:%.*]] = bitcast i16* [[TMP2]] to <4 x i16>*
; ENABLED_MASKED_STRIDED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP3]], i32 2, <4 x i1> [[TMP1]], <4 x i16> poison)
;
; CHECK-LABEL: @sink_into_replication_region(
; CHECK-NEXT: bb:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1
; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1)
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]]
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[TMP1]], -1
;
; UNROLL-LABEL: @sink_into_replication_region(
; UNROLL-NEXT: bb:
+; UNROLL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL: vector.ph:
; UNROLL-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1
; UNROLL-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1)
; UNROLL-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]]
-; UNROLL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLL: vector.ph:
; UNROLL-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7
; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; UNROLL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[TMP1]], -1
;
; CHECK-LABEL: @sink_into_replication_region_multiple(
; CHECK-NEXT: bb:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1
; CHECK-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1)
; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]]
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 3
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -4
; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[TMP1]], -1
;
; UNROLL-LABEL: @sink_into_replication_region_multiple(
; UNROLL-NEXT: bb:
+; UNROLL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; UNROLL: vector.ph:
; UNROLL-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1
; UNROLL-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1)
; UNROLL-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]]
-; UNROLL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLL: vector.ph:
; UNROLL-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 7
; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
; UNROLL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add i32 [[TMP1]], -1
; INTERLEAVE-NEXT: entry:
; INTERLEAVE-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 8)
; INTERLEAVE-NEXT: [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
-; INTERLEAVE-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 3
-; INTERLEAVE-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; INTERLEAVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64
; INTERLEAVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; INTERLEAVE: vector.ph:
+; INTERLEAVE-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 3
+; INTERLEAVE-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
; INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 7
; INTERLEAVE-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; INTERLEAVE-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 8, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC5]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC6]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDEX]], i32 2
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -2
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 2)
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP2]], 3
; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[NEXT_GEP]], i64 2
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC5]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC6]], [[VEC_IND]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i64 -2
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
define void @multiply_dont_hoist_phi(<4 x double>* noalias %A, <4 x double> * %B, [4 x double]* %C) {
; CHECK-LABEL: @multiply_dont_hoist_phi(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
+; CHECK-NEXT: br label [[NEXT:%.*]]
+; CHECK: next:
+; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr <4 x double>, <4 x double>* [[A:%.*]], i64 0, i64 2
; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
-; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast <4 x double>* [[B:%.*]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST3]], align 8
-; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr <4 x double>, <4 x double>* [[B]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr <4 x double>, <4 x double>* [[B:%.*]], i64 0, i64 2
; CHECK-NEXT: [[VEC_CAST6:%.*]] = bitcast double* [[VEC_GEP5]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST6]], align 8
-; CHECK-NEXT: br label [[NEXT:%.*]]
-; CHECK: next:
; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
+; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT13]]
; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[TMP0]])
+; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast <4 x double>* [[B]] to <2 x double>*
+; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST3]], align 8
; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
; CHECK-NEXT: br label [[BB1]]
; CHECK: bb1:
; CHECK-NEXT: [[J0:%.*]] = load i32, i32* [[J:%.*]], align 4
-; CHECK-NEXT: [[V6:%.*]] = and i32 [[I0]], 2
-; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V6]], [[J0]]
-; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
-; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof [[PROF16]]
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: [[V9:%.*]] = and i32 [[I0]], 4
; CHECK-NEXT: [[V10:%.*]] = icmp eq i32 [[V9]], 0
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[BB3]]
; CHECK: bb3:
+; CHECK-NEXT: [[V6:%.*]] = and i32 [[I0]], 2
+; CHECK-NEXT: [[V4:%.*]] = icmp eq i32 [[V6]], [[J0]]
+; CHECK-NEXT: [[V8:%.*]] = add i32 [[SUM0:%.*]], 43
+; CHECK-NEXT: [[SUM2:%.*]] = select i1 [[V4]], i32 [[SUM0]], i32 [[V8]], !prof [[PROF16]]
; CHECK-NEXT: [[V5:%.*]] = icmp eq i32 [[I0]], [[SUM2]]
; CHECK-NEXT: [[SUM3:%.*]] = select i1 [[V5]], i32 [[SUM2]], i32 [[V8]], !prof [[PROF16]]
; CHECK-NEXT: [[V11:%.*]] = add i32 [[I0]], [[SUM3]]