From d92cec4c96eb85096cc64643764e1370fc08eafc Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Tue, 17 May 2022 17:44:54 +0100
Subject: [PATCH] [LV] Regenerate check lines for some tests.

Make sure the auto-generated check lines are up-to-date for some files,
to reduce the test diff in upcoming changes
---
 .../AArch64/sve-epilog-vect-inloop-reductions.ll |   6 +-
 .../AArch64/sve-epilog-vect-reductions.ll        |   8 +-
 .../AArch64/sve-epilog-vect-strict-reductions.ll |   8 +-
 .../PowerPC/optimal-epilog-vectorization.ll      | 600 ++++++-------
 .../LoopVectorize/X86/invariant-load-gather.ll   |  32 +-
 .../LoopVectorize/X86/limit-vf-by-tripcount.ll   |  16 +-
 .../LoopVectorize/X86/masked_load_store.ll       | 952 ++++++++++-----------
 .../epilog-vectorization-reductions.ll           |  50 +-
 .../optimal-epilog-vectorization-liveout.ll      |  24 +-
 9 files changed, 848 insertions(+), 848 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
index b8f941a..a1b0e43 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
@@ -64,16 +64,16 @@ define i64 @int_reduction_and(i64* noalias nocapture %a, i64 %N) {
 ; CHECK-NEXT: [[N_VEC6:%.*]] = sub i64 [[N]], [[N_MOD_VF5]]
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP30:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX8]], 0
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP25]]
 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, i64* [[TMP26]], i32 0
 ; CHECK-NEXT: [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>*
 ; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4
 ; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[WIDE_LOAD10]])
 ; CHECK-NEXT: [[TMP30]] = and i64 [[TMP29]], [[VEC_PHI9]]
-; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 2
+; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]]
 ; CHECK-NEXT: br i1 [[TMP31]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
index e3e219c..7b2cbca 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
@@ -64,15 +64,15 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <2 x i64> [ [[TMP24]], [[VEC_EPILOG_PH]] ], [ [[TMP29:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX8]], 0
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP25]]
 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, i64* [[TMP26]], i32 0
 ; CHECK-NEXT: [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>*
 ; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4
 ; CHECK-NEXT: [[TMP29]] = add <2 x i64> [[WIDE_LOAD10]], [[VEC_PHI9]]
-; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 2
+; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]]
 ; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
@@ -91,7 +91,7 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT: [[ADD]] = add i64 [[TMP32]], [[SUM]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: for.end.loopexit:
 ; CHECK-NEXT: [[ADD_LCSSA4:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP31]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_END]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
index a32a672..61b4feb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
@@ -60,15 +60,15 @@ define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[N]], [[N_MOD_VF4]]
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi float [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX7]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 0
 ; CHECK-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <2 x float>*
 ; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x float>, <2 x float>* [[TMP26]], align 4
 ; CHECK-NEXT: [[TMP27]] = call float @llvm.vector.reduce.fadd.v2f32(float [[VEC_PHI8]], <2 x float> [[WIDE_LOAD9]])
-; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
+; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
 ; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
@@ -86,7 +86,7 @@ define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-NEXT: [[ADD]] = fadd float [[TMP29]], [[SUM_07]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: for.end.loopexit:
 ; CHECK-NEXT: [[ADD_LCSSA3:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_END]]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
index 5084720..d71bbe2 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
@@ -207,27 +207,27 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias
 ; VF-TWO-CHECK-NEXT: [[N_VEC26:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF25]]
 ; VF-TWO-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; VF-TWO-CHECK: vec.epilog.vector.body:
-; VF-TWO-CHECK-NEXT: [[INDEX27:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[INDEX27]], 0
+; VF-TWO-CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT31:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF-TWO-CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP133]]
 ; VF-TWO-CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds float, float* [[TMP134]], i32 0
 ; VF-TWO-CHECK-NEXT: [[TMP136:%.*]] = bitcast float* [[TMP135]] to <2 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x float>, <2 x float>* [[TMP136]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x float>, <2 x float>* [[TMP136]], align 4
 ; VF-TWO-CHECK-NEXT: [[TMP137:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP133]]
 ; VF-TWO-CHECK-NEXT: [[TMP138:%.*]] = getelementptr inbounds float, float* [[TMP137]], i32 0
 ; VF-TWO-CHECK-NEXT: [[TMP139:%.*]] = bitcast float* [[TMP138]] to <2 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x float>, <2 x float>* [[TMP139]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <2 x float> [[WIDE_LOAD30]], [[WIDE_LOAD31]]
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x float>, <2 x float>* [[TMP139]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <2 x float> [[WIDE_LOAD29]], [[WIDE_LOAD30]]
 ; VF-TWO-CHECK-NEXT: [[TMP141:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP133]]
 ; VF-TWO-CHECK-NEXT: [[TMP142:%.*]] = getelementptr inbounds float, float* [[TMP141]], i32 0
 ; VF-TWO-CHECK-NEXT: [[TMP143:%.*]] = bitcast float* [[TMP142]] to <2 x float>*
 ; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP140]], <2 x float>* [[TMP143]], align 4
-; VF-TWO-CHECK-NEXT: [[INDEX_NEXT28]] = add nuw i64 [[INDEX27]], 2
-; VF-TWO-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC26]]
+; VF-TWO-CHECK-NEXT: [[INDEX_NEXT31]] = add nuw i64 [[OFFSET_IDX]], 2
+; VF-TWO-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC26]]
 ; VF-TWO-CHECK-NEXT: br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOPID_EV:![0-9]+]]
 ; VF-TWO-CHECK: vec.epilog.middle.block:
-; VF-TWO-CHECK-NEXT: [[CMP_N29:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
-; VF-TWO-CHECK-NEXT: br i1 [[CMP_N29]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-TWO-CHECK-NEXT: [[CMP_N27:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
+; VF-TWO-CHECK-NEXT: br i1 [[CMP_N27]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-TWO-CHECK: vec.epilog.scalar.ph:
 ; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -451,27 +451,27 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias
 ; VF-FOUR-CHECK-NEXT: [[N_VEC26:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF25]]
 ; VF-FOUR-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; VF-FOUR-CHECK: vec.epilog.vector.body:
-; VF-FOUR-CHECK-NEXT: [[INDEX27:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-FOUR-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[INDEX27]], 0
+; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT31:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-FOUR-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF-FOUR-CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP133]]
 ; VF-FOUR-CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds float, float* [[TMP134]], i32 0
 ; VF-FOUR-CHECK-NEXT: [[TMP136:%.*]] = bitcast float* [[TMP135]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP136]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <4 x float>, <4 x float>* [[TMP136]], align 4
 ; VF-FOUR-CHECK-NEXT: [[TMP137:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP133]]
 ; VF-FOUR-CHECK-NEXT: [[TMP138:%.*]] = getelementptr inbounds float, float* [[TMP137]], i32 0
 ; VF-FOUR-CHECK-NEXT: [[TMP139:%.*]] = bitcast float* [[TMP138]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <4 x float>, <4 x float>* [[TMP139]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <4 x float> [[WIDE_LOAD30]], [[WIDE_LOAD31]]
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP139]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <4 x float> [[WIDE_LOAD29]], [[WIDE_LOAD30]]
 ; VF-FOUR-CHECK-NEXT: [[TMP141:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP133]]
 ; VF-FOUR-CHECK-NEXT: [[TMP142:%.*]] = getelementptr inbounds float, float* [[TMP141]], i32 0
 ; VF-FOUR-CHECK-NEXT: [[TMP143:%.*]] = bitcast float* [[TMP142]] to <4 x float>*
 ; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP140]], <4 x float>* [[TMP143]], align 4
-; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT28]] = add nuw i64 [[INDEX27]], 4
-; VF-FOUR-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC26]]
+; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT31]] = add nuw i64 [[OFFSET_IDX]], 4
+; VF-FOUR-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC26]]
 ; VF-FOUR-CHECK-NEXT: br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; VF-FOUR-CHECK: vec.epilog.middle.block:
-; VF-FOUR-CHECK-NEXT: [[CMP_N29:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
-; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N29]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-FOUR-CHECK-NEXT: [[CMP_N27:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
+; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N27]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-FOUR-CHECK: vec.epilog.scalar.ph:
 ; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; VF-FOUR-CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -540,12 +540,12 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-TWO-CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP2]])
 ; VF-TWO-CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
 ; VF-TWO-CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; VF-TWO-CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
-; VF-TWO-CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], [[TMP0]]
-; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
-; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
-; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP8]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; VF-TWO-CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
+; VF-TWO-CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[TMP0]]
+; VF-TWO-CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
+; VF-TWO-CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
+; VF-TWO-CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; VF-TWO-CHECK: vector.main.loop.iter.check:
 ; VF-TWO-CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
 ; VF-TWO-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -556,142 +556,142 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-TWO-CHECK: vector.body:
 ; VF-TWO-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF-TWO-CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; VF-TWO-CHECK-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 0
-; VF-TWO-CHECK-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 4
-; VF-TWO-CHECK-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 8
-; VF-TWO-CHECK-NEXT: [[TMP23:%.*]] = add i32 [[OFFSET_IDX]], 12
-; VF-TWO-CHECK-NEXT: [[TMP24:%.*]] = add i32 [[OFFSET_IDX]], 16
-; VF-TWO-CHECK-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 20
-; VF-TWO-CHECK-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 24
-; VF-TWO-CHECK-NEXT: [[TMP27:%.*]] = add i32 [[OFFSET_IDX]], 28
-; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 4
-; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 8
-; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 12
-; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
-; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 20
-; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 24
-; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 28
-; VF-TWO-CHECK-NEXT: [[TMP28:%.*]] = xor i32 [[TMP20]], -1
-; VF-TWO-CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP21]], -1
-; VF-TWO-CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP22]], -1
-; VF-TWO-CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP23]], -1
-; VF-TWO-CHECK-NEXT: [[TMP32:%.*]] = xor i32 [[TMP24]], -1
-; VF-TWO-CHECK-NEXT: [[TMP33:%.*]] = xor i32 [[TMP25]], -1
-; VF-TWO-CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP26]], -1
-; VF-TWO-CHECK-NEXT: [[TMP35:%.*]] = xor i32 [[TMP27]], -1
+; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], 0
+; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 4
+; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], 8
+; VF-TWO-CHECK-NEXT: [[TMP11:%.*]] = add i32 [[OFFSET_IDX]], 12
+; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = add i32 [[OFFSET_IDX]], 16
+; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = add i32 [[OFFSET_IDX]], 20
+; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 24
+; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 28
+; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 0
+; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 4
+; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 8
+; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 12
+; VF-TWO-CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 16
+; VF-TWO-CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 20
+; VF-TWO-CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 24
+; VF-TWO-CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 28
+; VF-TWO-CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP8]], -1
+; VF-TWO-CHECK-NEXT: [[TMP25:%.*]] = xor i32 [[TMP9]], -1
+; VF-TWO-CHECK-NEXT: [[TMP26:%.*]] = xor i32 [[TMP10]], -1
+; VF-TWO-CHECK-NEXT: [[TMP27:%.*]] = xor i32 [[TMP11]], -1
+; VF-TWO-CHECK-NEXT: [[TMP28:%.*]] = xor i32 [[TMP12]], -1
+; VF-TWO-CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP13]], -1
+; VF-TWO-CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP14]], -1
+; VF-TWO-CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP15]], -1
+; VF-TWO-CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP24]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP25]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP26]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP27]], [[N]]
 ; VF-TWO-CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP28]], [[N]]
 ; VF-TWO-CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP29]], [[N]]
 ; VF-TWO-CHECK-NEXT: [[TMP38:%.*]] = add i32 [[TMP30]], [[N]]
 ; VF-TWO-CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP31]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP40:%.*]] = add i32 [[TMP32]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP41:%.*]] = add i32 [[TMP33]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP34]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP35]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP40:%.*]] = sext i32 [[TMP32]] to i64
+; VF-TWO-CHECK-NEXT: [[TMP41:%.*]] = sext i32 [[TMP33]] to i64
+; VF-TWO-CHECK-NEXT: [[TMP42:%.*]] = sext i32 [[TMP34]] to i64
+; VF-TWO-CHECK-NEXT: [[TMP43:%.*]] = sext i32 [[TMP35]] to i64
 ; VF-TWO-CHECK-NEXT: [[TMP44:%.*]] = sext i32 [[TMP36]] to i64
 ; VF-TWO-CHECK-NEXT: [[TMP45:%.*]] = sext i32 [[TMP37]] to i64
 ; VF-TWO-CHECK-NEXT: [[TMP46:%.*]] = sext i32 [[TMP38]] to i64
 ; VF-TWO-CHECK-NEXT: [[TMP47:%.*]] = sext i32 [[TMP39]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP48:%.*]] = sext i32 [[TMP40]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP49:%.*]] = sext i32 [[TMP41]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP50:%.*]] = sext i32 [[TMP42]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP51:%.*]] = sext i32 [[TMP43]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP44]]
+; VF-TWO-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP40]]
+; VF-TWO-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP41]]
+; VF-TWO-CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP42]]
+; VF-TWO-CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP43]]
+; VF-TWO-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP44]]
 ; VF-TWO-CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP45]]
 ; VF-TWO-CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP46]]
 ; VF-TWO-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP47]]
-; VF-TWO-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP48]]
-; VF-TWO-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP49]]
-; VF-TWO-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP50]]
-; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP51]]
-; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 0
-; VF-TWO-CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, float* [[TMP60]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = bitcast float* [[TMP61]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP62]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 0
+; VF-TWO-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP58:%.*]] = bitcast float* [[TMP57]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP58]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -4
-; VF-TWO-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP63]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP65]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -4
+; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP59]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP61:%.*]] = bitcast float* [[TMP60]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP61]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -8
-; VF-TWO-CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, float* [[TMP66]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = bitcast float* [[TMP67]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP68]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -8
+; VF-TWO-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP62]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP64:%.*]] = bitcast float* [[TMP63]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP64]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -12
-; VF-TWO-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, float* [[TMP69]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP71:%.*]] = bitcast float* [[TMP70]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP71]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -12
+; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP65]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP67:%.*]] = bitcast float* [[TMP66]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP67]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -16
-; VF-TWO-CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = bitcast float* [[TMP73]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP74]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -16
+; VF-TWO-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP68]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP70:%.*]] = bitcast float* [[TMP69]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP70]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -20
-; VF-TWO-CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, float* [[TMP75]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP77:%.*]] = bitcast float* [[TMP76]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP77]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -20
+; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP71]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP73:%.*]] = bitcast float* [[TMP72]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP73]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -24
-; VF-TWO-CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[TMP78]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = bitcast float* [[TMP79]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP80]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -24
+; VF-TWO-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP74]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP76:%.*]] = bitcast float* [[TMP75]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP76]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -28
-; VF-TWO-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, float* [[TMP81]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP83:%.*]] = bitcast float* [[TMP82]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP83]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -28
+; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP77]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP79:%.*]] = bitcast float* [[TMP78]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP79]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP88:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP89:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP90:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP91:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP12]]
-; VF-TWO-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP13]]
-; VF-TWO-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; VF-TWO-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
-; VF-TWO-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]]
-; VF-TWO-CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
-; VF-TWO-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
-; VF-TWO-CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; VF-TWO-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 0
+; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP81:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP82:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP83:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP16]]
+; VF-TWO-CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
+; VF-TWO-CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
+; VF-TWO-CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
+; VF-TWO-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP20]]
+; VF-TWO-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]]
+; VF-TWO-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP22]]
+; VF-TWO-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
+; VF-TWO-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 0
+; VF-TWO-CHECK-NEXT: [[TMP97:%.*]] = bitcast float* [[TMP96]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP80]], <4 x float>* [[TMP97]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 4
+; VF-TWO-CHECK-NEXT: [[TMP99:%.*]] = bitcast float* [[TMP98]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP81]], <4 x float>* [[TMP99]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 8
 ; VF-TWO-CHECK-NEXT: [[TMP101:%.*]] = bitcast float* [[TMP100]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], <4 x float>* [[TMP101]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP82]], <4 x float>* [[TMP101]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 12
 ; VF-TWO-CHECK-NEXT: [[TMP103:%.*]] = bitcast float* [[TMP102]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], <4 x float>* [[TMP103]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 8
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP83]], <4 x float>* [[TMP103]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 16
 ; VF-TWO-CHECK-NEXT: [[TMP105:%.*]] = bitcast float* [[TMP104]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], <4 x float>* [[TMP105]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 12
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], <4 x float>* [[TMP105]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 20
 ; VF-TWO-CHECK-NEXT: [[TMP107:%.*]] = bitcast float* [[TMP106]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], <4 x float>* [[TMP107]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 16
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], <4 x float>* [[TMP107]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 24
 ; VF-TWO-CHECK-NEXT: [[TMP109:%.*]] = bitcast float* [[TMP108]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP88]], <4 x float>* [[TMP109]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 20
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], <4 x float>* [[TMP109]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 28
 ; VF-TWO-CHECK-NEXT: [[TMP111:%.*]] = bitcast float* [[TMP110]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP89]], <4 x float>* [[TMP111]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 24
-; VF-TWO-CHECK-NEXT: [[TMP113:%.*]] = bitcast float* [[TMP112]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP90]], <4 x float>* [[TMP113]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 28
-; VF-TWO-CHECK-NEXT: [[TMP115:%.*]] = bitcast float* [[TMP114]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP91]], <4 x float>* [[TMP115]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], <4 x float>* [[TMP111]], align 4
 ; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; VF-TWO-CHECK-NEXT: [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF-TWO-CHECK-NEXT: [[TMP112:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP112]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; VF-TWO-CHECK: middle.block:
 ; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; VF-TWO-CHECK: vec.epilog.iter.check:
-; VF-TWO-CHECK-NEXT: [[IND_END21:%.*]] = trunc i64 [[N_VEC]] to i32
+; VF-TWO-CHECK-NEXT: [[IND_END19:%.*]] = trunc i64 [[N_VEC]] to i32
 ; VF-TWO-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
 ; VF-TWO-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -702,49 +702,49 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-TWO-CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC17]] to i32
 ; VF-TWO-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; VF-TWO-CHECK: vec.epilog.vector.body:
-; VF-TWO-CHECK-NEXT: [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[OFFSET_IDX23:%.*]] = trunc i64 [[INDEX18]] to i32
-; VF-TWO-CHECK-NEXT: [[TMP118:%.*]] = add i32 [[OFFSET_IDX23]], 0
-; VF-TWO-CHECK-NEXT: [[TMP117:%.*]] = add i64 [[INDEX18]], 0
-; VF-TWO-CHECK-NEXT: [[TMP119:%.*]] = xor i32 [[TMP118]], -1
-; VF-TWO-CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP119]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP121:%.*]] = sext i32 [[TMP120]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP121]]
-; VF-TWO-CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds float, float* [[TMP122]], i32 0
-; VF-TWO-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 -1
-; VF-TWO-CHECK-NEXT: [[TMP125:%.*]] = bitcast float* [[TMP124]] to <2 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <2 x float>, <2 x float>* [[TMP125]], align 4
+; VF-TWO-CHECK-NEXT: [[OFFSET_IDX23:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT: [[OFFSET_IDX22:%.*]] = trunc i64 [[OFFSET_IDX23]] to i32
+; VF-TWO-CHECK-NEXT: [[TMP113:%.*]] = add i32 [[OFFSET_IDX22]], 0
+; VF-TWO-CHECK-NEXT: [[TMP114:%.*]] = add i64 [[OFFSET_IDX23]], 0
+; VF-TWO-CHECK-NEXT: [[TMP115:%.*]] = xor i32 [[TMP113]], -1
+; VF-TWO-CHECK-NEXT: [[TMP116:%.*]] = add i32 [[TMP115]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP117:%.*]] = sext i32 [[TMP116]] to i64
+; VF-TWO-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP117]]
+; VF-TWO-CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds float, float* [[TMP118]], i32 0
+; VF-TWO-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, float* [[TMP119]], i32 -1
+; VF-TWO-CHECK-NEXT: [[TMP121:%.*]] = bitcast float* [[TMP120]] to <2 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <2 x float>, <2 x float>* [[TMP121]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE25:%.*]] = shufflevector <2 x float> [[WIDE_LOAD24]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP126:%.*]] = fadd fast <2 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP127:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP117]]
-; VF-TWO-CHECK-NEXT: [[TMP128:%.*]] = getelementptr inbounds float, float* [[TMP127]], i32 0
-; VF-TWO-CHECK-NEXT: [[TMP129:%.*]] = bitcast float* [[TMP128]] to <2 x float>*
-; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP126]], <2 x float>* [[TMP129]], align 4
-; VF-TWO-CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX18]], 2
-; VF-TWO-CHECK-NEXT: [[TMP130:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC17]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP130]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF-TWO-CHECK-NEXT: [[TMP122:%.*]] = fadd fast <2 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP114]]
+; VF-TWO-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 0
+; VF-TWO-CHECK-NEXT: [[TMP125:%.*]] = bitcast float* [[TMP124]] to <2 x float>*
+; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP122]], <2 x float>* [[TMP125]], align 4
+; VF-TWO-CHECK-NEXT: [[INDEX_NEXT26]] = add nuw i64 [[OFFSET_IDX23]], 2
+; VF-TWO-CHECK-NEXT: [[TMP126:%.*]] = icmp eq i64 [[INDEX_NEXT26]], [[N_VEC17]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP126]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VF-TWO-CHECK: vec.epilog.middle.block:
-; VF-TWO-CHECK-NEXT: [[CMP_N22:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]]
-; VF-TWO-CHECK-NEXT: br i1 [[CMP_N22]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-TWO-CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]]
+; VF-TWO-CHECK-NEXT: br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-TWO-CHECK: vec.epilog.scalar.ph:
 ; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
-; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL20:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL18:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; VF-TWO-CHECK: for.body:
 ; VF-TWO-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL20]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[I_014]], -1
-; VF-TWO-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP131]], [[N]]
+; VF-TWO-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL18]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; VF-TWO-CHECK-NEXT: [[TMP127:%.*]] = xor i32 [[I_014]], -1
+; VF-TWO-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP127]], [[N]]
 ; VF-TWO-CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64
 ; VF-TWO-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]]
-; VF-TWO-CHECK-NEXT: [[TMP132:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; VF-TWO-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP132]], 1.000000e+00
+; VF-TWO-CHECK-NEXT: [[TMP128:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; VF-TWO-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP128]], 1.000000e+00
 ; VF-TWO-CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
 ; VF-TWO-CHECK-NEXT: store float [[CONV3]], float* [[ARRAYIDX5]], align 4
 ; VF-TWO-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; VF-TWO-CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_014]], 1
 ; VF-TWO-CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOPID_MS_CM:![0-9]+]]
 ; VF-TWO-CHECK: for.end.loopexit.loopexit:
 ; VF-TWO-CHECK-NEXT: br label [[FOR_END_LOOPEXIT]]
 ; VF-TWO-CHECK: for.end.loopexit:
@@ -767,12 +767,12 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-FOUR-CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP2]])
 ; VF-FOUR-CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
 ; VF-FOUR-CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; VF-FOUR-CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
-; VF-FOUR-CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], [[TMP0]]
-; VF-FOUR-CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
-; VF-FOUR-CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
-; VF-FOUR-CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP8]]
-; VF-FOUR-CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; VF-FOUR-CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
+; VF-FOUR-CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[TMP0]]
+; VF-FOUR-CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
+; VF-FOUR-CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
+; VF-FOUR-CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; VF-FOUR-CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; VF-FOUR-CHECK: vector.main.loop.iter.check:
 ; VF-FOUR-CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
 ; VF-FOUR-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -783,142 +783,142 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-FOUR-CHECK: vector.body:
 ; VF-FOUR-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; VF-FOUR-CHECK-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 0
-; VF-FOUR-CHECK-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 4
-; VF-FOUR-CHECK-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 8
-; VF-FOUR-CHECK-NEXT: [[TMP23:%.*]] = add i32 [[OFFSET_IDX]], 12
-; VF-FOUR-CHECK-NEXT: [[TMP24:%.*]] = add i32 [[OFFSET_IDX]], 16
-; VF-FOUR-CHECK-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 20
-; VF-FOUR-CHECK-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 24
-; VF-FOUR-CHECK-NEXT: [[TMP27:%.*]] = add i32 [[OFFSET_IDX]], 28
-; VF-FOUR-CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; VF-FOUR-CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 4
-; VF-FOUR-CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 8
-; VF-FOUR-CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 12
-; VF-FOUR-CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
-; VF-FOUR-CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 20
-; VF-FOUR-CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 24
-; VF-FOUR-CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 28
-; VF-FOUR-CHECK-NEXT: [[TMP28:%.*]] = xor i32 [[TMP20]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP21]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP22]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP23]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP32:%.*]] = xor i32 [[TMP24]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP33:%.*]] = xor i32 [[TMP25]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP26]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP35:%.*]] = xor i32 [[TMP27]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], 0
+; VF-FOUR-CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 4
+; VF-FOUR-CHECK-NEXT: [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], 8
+; VF-FOUR-CHECK-NEXT: [[TMP11:%.*]] = add i32 [[OFFSET_IDX]], 12
+; VF-FOUR-CHECK-NEXT: [[TMP12:%.*]] = add i32 [[OFFSET_IDX]], 16
+; VF-FOUR-CHECK-NEXT: [[TMP13:%.*]] = add i32 [[OFFSET_IDX]], 20
+; VF-FOUR-CHECK-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 24
+; VF-FOUR-CHECK-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 28
+; VF-FOUR-CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 0
+; VF-FOUR-CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 4
+; VF-FOUR-CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 8
+; VF-FOUR-CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 12
+; VF-FOUR-CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 16
+; VF-FOUR-CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 20
+; VF-FOUR-CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 24
+; VF-FOUR-CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 28
+; VF-FOUR-CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP8]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP25:%.*]] = xor i32 [[TMP9]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP26:%.*]] = xor i32 [[TMP10]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP27:%.*]] = xor i32 [[TMP11]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP28:%.*]] = xor i32 [[TMP12]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP13]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP14]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP15]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP24]], [[N]]
+; VF-FOUR-CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP25]], [[N]]
+; VF-FOUR-CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP26]], [[N]]
+; VF-FOUR-CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP27]], [[N]]
 ; VF-FOUR-CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP28]], [[N]]
 ; VF-FOUR-CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP29]], [[N]]
 ; VF-FOUR-CHECK-NEXT: [[TMP38:%.*]] = add i32 [[TMP30]], [[N]]
 ; VF-FOUR-CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP31]], [[N]]
-; VF-FOUR-CHECK-NEXT: [[TMP40:%.*]] = add i32 [[TMP32]], [[N]]
-; VF-FOUR-CHECK-NEXT: [[TMP41:%.*]] = add i32 [[TMP33]], [[N]]
-; VF-FOUR-CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP34]], [[N]]
-; VF-FOUR-CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP35]], [[N]]
+; VF-FOUR-CHECK-NEXT: [[TMP40:%.*]] = sext i32 [[TMP32]] to i64
+; VF-FOUR-CHECK-NEXT: [[TMP41:%.*]] = sext i32 [[TMP33]] to i64
+; VF-FOUR-CHECK-NEXT: [[TMP42:%.*]] = sext i32 [[TMP34]] to i64
+; VF-FOUR-CHECK-NEXT: [[TMP43:%.*]] = sext i32 [[TMP35]] to i64
 ; VF-FOUR-CHECK-NEXT: [[TMP44:%.*]] = sext i32 [[TMP36]] to i64
 ; VF-FOUR-CHECK-NEXT: [[TMP45:%.*]] = sext i32 [[TMP37]] to i64
 ; VF-FOUR-CHECK-NEXT: [[TMP46:%.*]] = sext i32 [[TMP38]] to i64
 ; VF-FOUR-CHECK-NEXT: [[TMP47:%.*]] = sext i32 [[TMP39]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP48:%.*]] = sext i32 [[TMP40]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP49:%.*]] = sext i32 [[TMP41]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP50:%.*]] = sext i32 [[TMP42]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP51:%.*]] = sext i32 [[TMP43]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP44]]
+; VF-FOUR-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP40]]
+; VF-FOUR-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP41]]
+; VF-FOUR-CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP42]]
+; VF-FOUR-CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP43]]
+; VF-FOUR-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP44]]
 ; VF-FOUR-CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP45]]
 ; VF-FOUR-CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP46]]
 ; VF-FOUR-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP47]]
-; VF-FOUR-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP48]]
-; VF-FOUR-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP49]]
-; VF-FOUR-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP50]]
-; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP51]]
-; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 0
-; VF-FOUR-CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, float* [[TMP60]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = bitcast float* [[TMP61]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP62]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 0
+; VF-FOUR-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP58:%.*]] = bitcast float* [[TMP57]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP58]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -4
-; VF-FOUR-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP63]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP65]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -4
+; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP59]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP61:%.*]] = bitcast float* [[TMP60]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP61]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -8
-; VF-FOUR-CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, float* [[TMP66]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = bitcast float* [[TMP67]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP68]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -8
+; VF-FOUR-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP62]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP64:%.*]] = bitcast float* [[TMP63]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP64]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -12
-; VF-FOUR-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, float* [[TMP69]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP71:%.*]] = bitcast float* [[TMP70]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP71]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -12
+; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP65]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP67:%.*]] = bitcast float* [[TMP66]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP67]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -16
-; VF-FOUR-CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = bitcast float* [[TMP73]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP74]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -16
+; VF-FOUR-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP68]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP70:%.*]] = bitcast float* [[TMP69]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP70]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -20
-; VF-FOUR-CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, float* [[TMP75]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP77:%.*]] = bitcast float* [[TMP76]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP77]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -20
+; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP71]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP73:%.*]] = bitcast float* [[TMP72]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP73]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -24
-; VF-FOUR-CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[TMP78]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = bitcast float* [[TMP79]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP80]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -24
+; VF-FOUR-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP74]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP76:%.*]] = bitcast float* [[TMP75]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP76]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -28
-; VF-FOUR-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, float* [[TMP81]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP83:%.*]] = bitcast float* [[TMP82]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP83]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -28
+; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP77]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP79:%.*]] = bitcast float* [[TMP78]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP79]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP88:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP89:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP90:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP91:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP12]]
-; VF-FOUR-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP13]]
-; VF-FOUR-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; VF-FOUR-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
-; VF-FOUR-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]]
-; VF-FOUR-CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
-; VF-FOUR-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
-; VF-FOUR-CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; VF-FOUR-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 0
+; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP81:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP82:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP83:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP16]]
+; VF-FOUR-CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
+; VF-FOUR-CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
+; VF-FOUR-CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
+; VF-FOUR-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP20]]
+; VF-FOUR-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]]
+; VF-FOUR-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP22]]
+; VF-FOUR-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
+; VF-FOUR-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 0
+; VF-FOUR-CHECK-NEXT: [[TMP97:%.*]] = bitcast float* [[TMP96]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP80]], <4 x float>* [[TMP97]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 4
+; VF-FOUR-CHECK-NEXT: [[TMP99:%.*]] = bitcast float* [[TMP98]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP81]], <4 x float>* [[TMP99]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 8
 ; VF-FOUR-CHECK-NEXT: [[TMP101:%.*]] = bitcast float* [[TMP100]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], <4 x float>* [[TMP101]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP82]], <4 x float>* [[TMP101]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 12
 ; VF-FOUR-CHECK-NEXT: [[TMP103:%.*]] = bitcast float* [[TMP102]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], <4 x float>* [[TMP103]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 8
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP83]], <4 x float>* [[TMP103]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 16
 ; VF-FOUR-CHECK-NEXT: [[TMP105:%.*]] = bitcast float* [[TMP104]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], <4 x float>* [[TMP105]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 12
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], <4 x float>* [[TMP105]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 20
 ; VF-FOUR-CHECK-NEXT: [[TMP107:%.*]] = bitcast float* [[TMP106]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP87]], <4 x float>* [[TMP107]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 16
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], <4 x float>* [[TMP107]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 24
 ; VF-FOUR-CHECK-NEXT: [[TMP109:%.*]] = bitcast float* [[TMP108]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP88]], <4 x float>* [[TMP109]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 20
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], <4 x float>* [[TMP109]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 28
 ; VF-FOUR-CHECK-NEXT: [[TMP111:%.*]] = bitcast float* [[TMP110]] to <4 x float>*
 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP89]], <4 x float>* [[TMP111]], align 4
 -; VF-FOUR-CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 24
-; VF-FOUR-CHECK-NEXT: [[TMP113:%.*]] = bitcast float* [[TMP112]] to <4 x float>* -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP90]], <4 x float>* [[TMP113]], align 4 -; VF-FOUR-CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 28 -; VF-FOUR-CHECK-NEXT: [[TMP115:%.*]] = bitcast float* [[TMP114]] to <4 x float>* -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP91]], <4 x float>* [[TMP115]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP87]], <4 x float>* [[TMP111]], align 4 ; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; VF-FOUR-CHECK-NEXT: [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VF-FOUR-CHECK-NEXT: br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOPID_MV_CM:![0-9]+]] +; VF-FOUR-CHECK-NEXT: [[TMP112:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF-FOUR-CHECK-NEXT: br i1 [[TMP112]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOPID_MV_CM:![0-9]+]] ; VF-FOUR-CHECK: middle.block: ; VF-FOUR-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; VF-FOUR-CHECK: vec.epilog.iter.check: -; VF-FOUR-CHECK-NEXT: [[IND_END21:%.*]] = trunc i64 [[N_VEC]] to i32 +; VF-FOUR-CHECK-NEXT: [[IND_END19:%.*]] = trunc i64 [[N_VEC]] to i32 ; VF-FOUR-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-FOUR-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; VF-FOUR-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] @@ -929,49 +929,49 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe ; VF-FOUR-CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC17]] to i32 ; VF-FOUR-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; VF-FOUR-CHECK: vec.epilog.vector.body: -; VF-FOUR-CHECK-NEXT: [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX23:%.*]] = trunc i64 [[INDEX18]] to i32 -; VF-FOUR-CHECK-NEXT: [[TMP118:%.*]] = add i32 [[OFFSET_IDX23]], 0 -; VF-FOUR-CHECK-NEXT: [[TMP117:%.*]] = add i64 [[INDEX18]], 0 -; VF-FOUR-CHECK-NEXT: [[TMP119:%.*]] = xor i32 [[TMP118]], -1 -; VF-FOUR-CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP119]], [[N]] -; VF-FOUR-CHECK-NEXT: [[TMP121:%.*]] = sext i32 [[TMP120]] to i64 -; VF-FOUR-CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP121]] -; VF-FOUR-CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds float, float* [[TMP122]], i32 0 -; VF-FOUR-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[TMP125:%.*]] = bitcast float* [[TMP124]] to <4 x float>* -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, <4 x float>* [[TMP125]], align 4 +; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX23:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX22:%.*]] = trunc i64 [[OFFSET_IDX23]] to i32 +; VF-FOUR-CHECK-NEXT: [[TMP113:%.*]] = add i32 [[OFFSET_IDX22]], 0 +; VF-FOUR-CHECK-NEXT: [[TMP114:%.*]] = add i64 [[OFFSET_IDX23]], 0 +; VF-FOUR-CHECK-NEXT: [[TMP115:%.*]] = xor i32 [[TMP113]], -1 +; VF-FOUR-CHECK-NEXT: [[TMP116:%.*]] = add i32 [[TMP115]], [[N]] +; VF-FOUR-CHECK-NEXT: [[TMP117:%.*]] = sext i32 [[TMP116]] to i64 +; 
VF-FOUR-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP117]] +; VF-FOUR-CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds float, float* [[TMP118]], i32 0 +; VF-FOUR-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, float* [[TMP119]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP121:%.*]] = bitcast float* [[TMP120]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, <4 x float>* [[TMP121]], align 4 ; VF-FOUR-CHECK-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x float> [[WIDE_LOAD24]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; VF-FOUR-CHECK-NEXT: [[TMP126:%.*]] = fadd fast <4 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> -; VF-FOUR-CHECK-NEXT: [[TMP127:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP117]] -; VF-FOUR-CHECK-NEXT: [[TMP128:%.*]] = getelementptr inbounds float, float* [[TMP127]], i32 0 -; VF-FOUR-CHECK-NEXT: [[TMP129:%.*]] = bitcast float* [[TMP128]] to <4 x float>* -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP126]], <4 x float>* [[TMP129]], align 4 -; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX18]], 4 -; VF-FOUR-CHECK-NEXT: [[TMP130:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC17]] -; VF-FOUR-CHECK-NEXT: br i1 [[TMP130]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOPID_EV_CM:![0-9]+]] +; VF-FOUR-CHECK-NEXT: [[TMP122:%.*]] = fadd fast <4 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> +; VF-FOUR-CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP114]] +; VF-FOUR-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 0 +; VF-FOUR-CHECK-NEXT: [[TMP125:%.*]] = bitcast float* [[TMP124]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP122]], <4 x float>* [[TMP125]], align 4 +; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT26]] = add nuw i64 [[OFFSET_IDX23]], 4 +; VF-FOUR-CHECK-NEXT: [[TMP126:%.*]] = icmp eq i64 [[INDEX_NEXT26]], [[N_VEC17]] +; VF-FOUR-CHECK-NEXT: br i1 [[TMP126]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOPID_EV_CM:![0-9]+]] ; VF-FOUR-CHECK: vec.epilog.middle.block: -; VF-FOUR-CHECK-NEXT: [[CMP_N22:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]] -; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N22]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; VF-FOUR-CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]] +; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; VF-FOUR-CHECK: vec.epilog.scalar.ph: ; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] -; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL20:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] +; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL18:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] ; VF-FOUR-CHECK-NEXT: br label [[FOR_BODY:%.*]] ; VF-FOUR-CHECK: for.body: ; VF-FOUR-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; VF-FOUR-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL20]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; VF-FOUR-CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[I_014]], -1 -;
VF-FOUR-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP131]], [[N]] +; VF-FOUR-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL18]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; VF-FOUR-CHECK-NEXT: [[TMP127:%.*]] = xor i32 [[I_014]], -1 +; VF-FOUR-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP127]], [[N]] ; VF-FOUR-CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64 ; VF-FOUR-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]] -; VF-FOUR-CHECK-NEXT: [[TMP132:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; VF-FOUR-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP132]], 1.000000e+00 +; VF-FOUR-CHECK-NEXT: [[TMP128:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; VF-FOUR-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP128]], 1.000000e+00 ; VF-FOUR-CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] ; VF-FOUR-CHECK-NEXT: store float [[CONV3]], float* [[ARRAYIDX5]], align 4 ; VF-FOUR-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VF-FOUR-CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_014]], 1 ; VF-FOUR-CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; VF-FOUR-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOPID_MS_CM:![0-9]+]] +; VF-FOUR-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]] ; VF-FOUR-CHECK: for.end.loopexit.loopexit: ; VF-FOUR-CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] ; VF-FOUR-CHECK: for.end.loopexit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll index e33be35..313c7a8 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -50,29 +50,29 @@ define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC13:%.*]] = and i64 [[SMAX6]], 9223372036854775800 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT17]], <8 x i32*> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT19]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[N_VEC12:%.*]] = and i64 [[SMAX6]], 9223372036854775800 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT16]], <8 x i32*> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT18]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 
[[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>* -; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT20]], <8 x i32>* [[TMP6]], align 4 -; CHECK-NEXT: [[INDEX_NEXT23]] = add nuw i64 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC13]] +; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT19]], <8 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC12]] ; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT18]], zeroinitializer -; CHECK-NEXT: [[WIDE_MASKED_GATHER21:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT18]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef) -; CHECK-NEXT: [[PREDPHI22:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER21]], <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> -; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC13]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI22]], i64 7 -; CHECK-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT17]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER20:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT17]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef) +; CHECK-NEXT: [[PREDPHI21:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER20]], <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[CMP_N13:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC12]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI21]], i64 7 +; CHECK-NEXT: br i1 [[CMP_N13]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll index 2cfa1de..19e10b3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll @@ -36,22 +36,22 @@ define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 { ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]],
[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 64 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <8 x i8>* -; CHECK-NEXT: store <8 x i8> [[WIDE_LOAD4]], <8 x i8>* [[TMP14]], align 64 -; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 16 +; CHECK-NEXT: store <8 x i8> [[WIDE_LOAD3]], <8 x i8>* [[TMP14]], align 64 +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 16 ; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i64 17, 16 -; CHECK-NEXT: br i1 [[CMP_N3]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 17, 16 +; CHECK-NEXT: br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP_MEMCPY_EXPANSION:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index e48fb15..4014f9e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -35,24 +35,24 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4 -; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0 -; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) -; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0 -;
AVX1-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP4]]) +; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4 +; AVX1-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0 +; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x i32> poison) +; AVX1-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i32 0 +; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP10]], <8 x i32>* [[TMP13]], i32 4, <8 x i1> [[TMP6]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; AVX1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; AVX1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -62,20 +62,20 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP13]], 100 +; AVX1-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP15]], 100 ; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX1: if.then: ; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] +; AVX1-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]] ; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] ; AVX1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4 ; AVX1-NEXT: br label [[FOR_INC]] ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; AVX1: for.end: ; AVX1-NEXT: ret
void ; @@ -96,69 +96,69 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 -; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 8 +; AVX2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 16 +; AVX2-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 24 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4 -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 8 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 +; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 -; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP1]] +; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 24 +; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32>* [[TMP17]], align 4 +; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP20:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[TMP20]], i32 0 -; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison) -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP20]], i32 8 +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP22]], i32 0 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison) -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP20]], i32 16 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison) +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP22]], i32 8 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison) -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP20]], i32 24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison) +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP22]], i32 16 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison) -; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] -; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] -; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP1]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x i32> poison) +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP22]], i32 24 +; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x i32> poison) +; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]] +; AVX2-NEXT: [[TMP36:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]] +; AVX2-NEXT: [[TMP37:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]] ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[TMP36]], i32 0 -; AVX2-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <8 x i32>* -; AVX2-NEXT: call void 
@llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP32]], <8 x i32>* [[TMP41]], i32 4, <8 x i1> [[TMP16]]) -; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP36]], i32 8 +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP41:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP38]], i32 0 ; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP33]], <8 x i32>* [[TMP43]], i32 4, <8 x i1> [[TMP17]]) -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP36]], i32 16 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP34]], <8 x i32>* [[TMP43]], i32 4, <8 x i1> [[TMP18]]) +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP38]], i32 8 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP34]], <8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP18]]) -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP36]], i32 24 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP19]]) +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP38]], i32 16 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP19]]) +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP36]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP20]]) +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[TMP38]], i32 24 +; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP37]], <8 x i32>* [[TMP49]], i32 4, <8 x i1> [[TMP21]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; AVX2-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX2-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -168,20 +168,20 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP49]], 100 +; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP51]], 100 ; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX2: if.then: ; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP50:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP50]], [[TMP49]] +; AVX2-NEXT: [[TMP52:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP52]], [[TMP51]] ; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* 
[[A]], i64 [[INDVARS_IV]] ; AVX2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -204,69 +204,69 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 -; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 48 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 16 +; AVX512-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 32 +; AVX512-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 48 ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4 -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 32 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 48 +; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4 +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 32 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4 -; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP1]] +; AVX512-NEXT:
[[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 48 +; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32>* [[TMP17]], align 4 +; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[TMP20]], i32 0 -; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison) -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP20]], i32 16 +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP22]], i32 0 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison) -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP20]], i32 32 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison) +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP22]], i32 16 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP20]], i32 48 +; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP22]], i32 32 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison) -; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] -; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] -; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP1]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x i32> poison) +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP22]], i32 48 +; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32>
@llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x i32> poison) +; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]] +; AVX512-NEXT: [[TMP36:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]] +; AVX512-NEXT: [[TMP37:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]] ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[TMP36]], i32 0 -; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP32]], <16 x i32>* [[TMP41]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP36]], i32 16 +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP38]], i32 0 ; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP33]], <16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP17]]) -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP36]], i32 32 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP34]], <16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP18]]) +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP38]], i32 16 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP34]], <16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP18]]) -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP36]], i32 48 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP19]]) +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP38]], i32 32 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP19]]) +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP36]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP20]]) +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[TMP38]], i32 48 +; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP37]], <16 x i32>* [[TMP49]], i32 4, <16 x i1> [[TMP21]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 -; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; AVX512-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX512-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -276,48 +276,48 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, 
[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; AVX512: vec.epilog.vector.body: -; AVX512-NEXT: [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP49:%.*]] = add i64 [[INDEX19]], 0 -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[TMP50]], i32 0 -; AVX512-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32>* [[TMP52]], align 4 -; AVX512-NEXT: [[TMP53:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD20]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr i32, i32* [[TMP54]], i32 0 -; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP56]], i32 4, <8 x i1> [[TMP53]], <8 x i32> poison) -; AVX512-NEXT: [[TMP57:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD21]], [[WIDE_LOAD20]] -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr i32, i32* [[TMP58]], i32 0 -; AVX512-NEXT: [[TMP60:%.*]] = bitcast i32* [[TMP59]] to <8 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP57]], <8 x i32>* [[TMP60]], i32 4, <8 x i1> [[TMP53]]) -; AVX512-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX19]], 8 -; AVX512-NEXT: [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT22]], 10000 -; AVX512-NEXT: br i1 [[TMP61]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; AVX512-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 0 +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP51]] +; AVX512-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[TMP52]], i32 0 +; AVX512-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP54]], align 4 +; AVX512-NEXT: [[TMP55:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP56:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP51]] +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr i32, i32* [[TMP56]], i32 0 +; AVX512-NEXT: [[TMP58:%.*]] = bitcast i32* [[TMP57]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP58]], i32 4, <8 x i1> [[TMP55]], <8 x i32> poison) +; AVX512-NEXT: [[TMP59:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]] +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP51]] +; AVX512-NEXT: [[TMP61:%.*]] = getelementptr i32, i32* [[TMP60]], i32 0 +; AVX512-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <8 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP59]], <8 x i32>* [[TMP62]], i32 4, <8 x i1> [[TMP55]]) +; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8 +; AVX512-NEXT: [[TMP63:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000 +; AVX512-NEXT: br i1 [[TMP63]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; AVX512: vec.epilog.middle.block: -; AVX512-NEXT: [[CMP_N18:%.*]] =
icmp eq i64 10000, 10000 -; AVX512-NEXT: br i1 [[CMP_N18]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX512-NEXT: [[CMP_N11:%.*]] = icmp eq i64 10000, 10000 +; AVX512-NEXT: br i1 [[CMP_N11]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX512: vec.epilog.scalar.ph: ; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP62:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP62]], 100 +; AVX512-NEXT: [[TMP64:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP64]], 100 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP63:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP63]], [[TMP62]] +; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP65]], [[TMP64]] ; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] ; AVX512-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: @@ -370,24 +370,24 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP1]], i32 0 -; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4 -; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP5]], i32 0 -; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) -; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr
i32, i32 addrspace(1)* [[TMP9]], i32 0 -; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]) +; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP5]], align 4 +; AVX1-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP7]], i32 0 +; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x i32> poison) +; AVX1-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP12:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP11]], i32 0 +; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP10]], <8 x i32> addrspace(1)* [[TMP13]], i32 4, <8 x i1> [[TMP6]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; AVX1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; AVX1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -397,20 +397,20 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP13:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 -; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP13]], 100 +; AVX1-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP15]], 100 ; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX1: if.then: ; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] +; AVX1-NEXT: [[TMP16:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 +; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]] ; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]] ; AVX1-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4 ; AVX1-NEXT: br label [[FOR_INC]] ; AVX1:
for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -431,69 +431,69 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 -; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 8 +; AVX2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 16 +; AVX2-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 24 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4 -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 8 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 0 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 16 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 8 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 24 +; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 16 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4 -; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr
i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP1]] +; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 24 +; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP17]], align 4 +; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP20:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 0 -; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison) -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 8 +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 0 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison) -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 16 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison) +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 8 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison) -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison) +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 16 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison) -; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] -; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] -; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP37:%.*]] =
getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP1]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x i32> poison) +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 24 +; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x i32> poison) +; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]] +; AVX2-NEXT: [[TMP36:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]] +; AVX2-NEXT: [[TMP37:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]] ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 0 -; AVX2-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP32]], <8 x i32> addrspace(1)* [[TMP41]], i32 4, <8 x i1> [[TMP16]]) -; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 8 +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP41:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 0 ; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP33]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP17]]) -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 16 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP18]]) +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 8 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP18]]) -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 24 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP19]]) +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 16 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]) +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP36]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP20]]) +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 24 +; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP37]], <8 x i32> addrspace(1)* [[TMP49]], i32 4, <8 x i1> [[TMP21]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add 
nuw i64 [[INDEX]], 32 -; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; AVX2-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX2-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -503,20 +503,20 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP49:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP49]], 100 +; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP51]], 100 ; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX2: if.then: ; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP50:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 -; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP50]], [[TMP49]] +; AVX2-NEXT: [[TMP52:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP52]], [[TMP51]] ; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]] ; AVX2-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -539,69 +539,69 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 -; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 48 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 16 +; AVX512-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 32 +; AVX512-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 48 ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: 
[[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP9]], align 4 -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 16 +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 0 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 32 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4 +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 16 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 48 +; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4 +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 32 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4 -; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], -; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], -; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP1]] +; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 48 +; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP17]], align 4 +; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], +; AVX512-NEXT: [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], +; AVX512-NEXT: [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 0 -; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison) -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 16 +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP5]] +; 
AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 0 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison) -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 32 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison) +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 16 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 48 +; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 32 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison) -; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] -; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] -; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP1]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x i32> poison) +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 48 +; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x i32> poison) +; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]] +; AVX512-NEXT: [[TMP36:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]] +; AVX512-NEXT: [[TMP37:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]] ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 0 -; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP32]], <16 x i32> addrspace(1)* [[TMP41]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: 
[[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 16 +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 0 ; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP33]], <16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP17]]) -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 32 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP18]]) +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 16 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP18]]) -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 48 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP19]]) +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 32 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP19]]) +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP36]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP20]]) +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 48 +; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP37]], <16 x i32> addrspace(1)* [[TMP49]], i32 4, <16 x i1> [[TMP21]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 -; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; AVX512-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX512-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -611,48 +611,48 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; AVX512: vec.epilog.vector.body: -; AVX512-NEXT: [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP49:%.*]] = add i64 [[INDEX19]], 0 -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP50]], i32 0 -; AVX512-NEXT: [[TMP52:%.*]] = bitcast i32 addrspace(1)* 
[[TMP51]] to <8 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP52]], align 4 -; AVX512-NEXT: [[TMP53:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD20]], -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP54]], i32 0 -; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32 addrspace(1)* [[TMP55]] to <8 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP56]], i32 4, <8 x i1> [[TMP53]], <8 x i32> poison) -; AVX512-NEXT: [[TMP57:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD21]], [[WIDE_LOAD20]] -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP58]], i32 0 -; AVX512-NEXT: [[TMP60:%.*]] = bitcast i32 addrspace(1)* [[TMP59]] to <8 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP57]], <8 x i32> addrspace(1)* [[TMP60]], i32 4, <8 x i1> [[TMP53]]) -; AVX512-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX19]], 8 -; AVX512-NEXT: [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT22]], 10000 -; AVX512-NEXT: br i1 [[TMP61]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; AVX512-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 0 +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP51]] +; AVX512-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP52]], i32 0 +; AVX512-NEXT: [[TMP54:%.*]] = bitcast i32 addrspace(1)* [[TMP53]] to <8 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP54]], align 4 +; AVX512-NEXT: [[TMP55:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], +; AVX512-NEXT: [[TMP56:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP51]] +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i32 0 +; AVX512-NEXT: [[TMP58:%.*]] = bitcast i32 addrspace(1)* [[TMP57]] to <8 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP58]], i32 4, <8 x i1> [[TMP55]], <8 x i32> poison) +; AVX512-NEXT: [[TMP59:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]] +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP51]] +; AVX512-NEXT: [[TMP61:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP60]], i32 0 +; AVX512-NEXT: [[TMP62:%.*]] = bitcast i32 addrspace(1)* [[TMP61]] to <8 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP59]], <8 x i32> addrspace(1)* [[TMP62]], i32 4, <8 x i1> [[TMP55]]) +; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8 +; AVX512-NEXT: [[TMP63:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000 +; AVX512-NEXT: br i1 [[TMP63]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AVX512: vec.epilog.middle.block: -; AVX512-NEXT: [[CMP_N18:%.*]] = icmp eq i64 10000, 10000 -; AVX512-NEXT: br i1 [[CMP_N18]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX512-NEXT: [[CMP_N11:%.*]] = icmp eq i64 10000, 10000 +; 
AVX512-NEXT: br i1 [[CMP_N11]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX512: vec.epilog.scalar.ph: ; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP62:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 -; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP62]], 100 +; AVX512-NEXT: [[TMP64:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP64]], 100 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP63:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 -; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP63]], [[TMP62]] +; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP65]], [[TMP64]] ; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]] ; AVX512-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: @@ -714,25 +714,25 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4 -; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr float, float* [[B]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[TMP5]], i32 0 -; AVX1-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x float> poison) -; AVX1-NEXT: [[TMP8:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> -; AVX1-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP8]] -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0 -; AVX1-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <8 x float>* -; AVX1-NEXT: call void 
@llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]) +; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4 +; AVX1-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP7:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[TMP7]], i32 0 +; AVX1-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x float> poison) +; AVX1-NEXT: [[TMP10:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> +; AVX1-NEXT: [[TMP11:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP10]] +; AVX1-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP13:%.*]] = getelementptr float, float* [[TMP12]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <8 x float>* +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP11]], <8 x float>* [[TMP14]], i32 4, <8 x i1> [[TMP6]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; AVX1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; AVX1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -742,21 +742,21 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP14]], 100 +; AVX1-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP16]], 100 ; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX1: if.then: ; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP14]] to float -; AVX1-NEXT: [[ADD:%.*]] = fadd float [[TMP15]], [[CONV]] +; AVX1-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP16]] to float +; AVX1-NEXT: [[ADD:%.*]] = fadd float [[TMP17]], [[CONV]] ; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] ; AVX1-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4 ; AVX1-NEXT: br label [[FOR_INC]] ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], 
!llvm.loop [[LOOP30:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -777,73 +777,73 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 -; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 8 +; AVX2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 16 +; AVX2-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 24 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4 -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 8 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 +; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 -; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], -; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], -; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[B]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr float, float* [[B]], i64 [[TMP1]] +; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 24 +; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32>* [[TMP17]], align 4 +; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], +; AVX2-NEXT: 
[[TMP20:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], +; AVX2-NEXT: [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP20]], i32 0 -; AVX2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x float> poison) -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP20]], i32 8 +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[B]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr float, float* [[B]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP22]], i32 0 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x float> poison) -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP20]], i32 16 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison) +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP22]], i32 8 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison) -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP20]], i32 24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison) +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP22]], i32 16 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison) -; AVX2-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> -; AVX2-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x float> -; AVX2-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float> -; AVX2-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD14]] to <8 x float> -; AVX2-NEXT: [[TMP36:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP32]] -; AVX2-NEXT: [[TMP37:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD15]], [[TMP33]] -; AVX2-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD16]], [[TMP34]] -; AVX2-NEXT: [[TMP39:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD17]], [[TMP35]] -; AVX2-NEXT: [[TMP40:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP41:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x float> poison) +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP22]], i32 24 +; AVX2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x float> poison) +; AVX2-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> +; 
AVX2-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD5]] to <8 x float> +; AVX2-NEXT: [[TMP36:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x float> +; AVX2-NEXT: [[TMP37:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x float> +; AVX2-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP34]] +; AVX2-NEXT: [[TMP39:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD8]], [[TMP35]] +; AVX2-NEXT: [[TMP40:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD9]], [[TMP36]] +; AVX2-NEXT: [[TMP41:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD10]], [[TMP37]] ; AVX2-NEXT: [[TMP42:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[TMP40]], i32 0 -; AVX2-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP36]], <8 x float>* [[TMP45]], i32 4, <8 x i1> [[TMP16]]) -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP40]], i32 8 +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[A]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP45:%.*]] = getelementptr float, float* [[A]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP42]], i32 0 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP37]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP17]]) -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP40]], i32 16 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP18]]) +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP42]], i32 8 ; AVX2-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP18]]) -; AVX2-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP40]], i32 24 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP19]]) +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP42]], i32 16 ; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]) +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP40]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP20]]) +; AVX2-NEXT: [[TMP52:%.*]] = getelementptr float, float* [[TMP42]], i32 24 +; AVX2-NEXT: [[TMP53:%.*]] = bitcast float* [[TMP52]] to <8 x float>* +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP41]], <8 x float>* [[TMP53]], i32 4, <8 x i1> [[TMP21]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; AVX2-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX2-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -853,21 +853,21 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX2: for.body: ; AVX2-NEXT: 
[[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP53]], 100 +; AVX2-NEXT: [[TMP55:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP55]], 100 ; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX2: if.then: ; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP54:%.*]] = load float, float* [[ARRAYIDX3]], align 4 -; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP53]] to float -; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP54]], [[CONV]] +; AVX2-NEXT: [[TMP56:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP55]] to float +; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP56]], [[CONV]] ; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] ; AVX2-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -890,73 +890,73 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 -; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 48 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 16 +; AVX512-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 32 +; AVX512-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 48 ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4 -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 32 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], 
align 4 +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 48 +; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4 +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 32 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4 -; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], -; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], -; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[B]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr float, float* [[B]], i64 [[TMP1]] +; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 48 +; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32>* [[TMP17]], align 4 +; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], +; AVX512-NEXT: [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], +; AVX512-NEXT: [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP20]], i32 0 -; AVX512-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x float> poison) -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP20]], i32 16 +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[B]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr float, float* [[B]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP22]], i32 0 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x float> poison) -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP20]], i32 32 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x float> poison) +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP22]], i32 16 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x float> poison) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP20]], i32 48 +; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x float> poison) +; AVX512-NEXT: 
[[TMP30:%.*]] = getelementptr float, float* [[TMP22]], i32 32 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x float> poison) -; AVX512-NEXT: [[TMP32:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> -; AVX512-NEXT: [[TMP33:%.*]] = sitofp <16 x i32> [[WIDE_LOAD12]] to <16 x float> -; AVX512-NEXT: [[TMP34:%.*]] = sitofp <16 x i32> [[WIDE_LOAD13]] to <16 x float> -; AVX512-NEXT: [[TMP35:%.*]] = sitofp <16 x i32> [[WIDE_LOAD14]] to <16 x float> -; AVX512-NEXT: [[TMP36:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP32]] -; AVX512-NEXT: [[TMP37:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD15]], [[TMP33]] -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD16]], [[TMP34]] -; AVX512-NEXT: [[TMP39:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD17]], [[TMP35]] -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP41:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x float> poison) +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP22]], i32 48 +; AVX512-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <16 x float>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x float> poison) +; AVX512-NEXT: [[TMP34:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> +; AVX512-NEXT: [[TMP35:%.*]] = sitofp <16 x i32> [[WIDE_LOAD5]] to <16 x float> +; AVX512-NEXT: [[TMP36:%.*]] = sitofp <16 x i32> [[WIDE_LOAD6]] to <16 x float> +; AVX512-NEXT: [[TMP37:%.*]] = sitofp <16 x i32> [[WIDE_LOAD7]] to <16 x float> +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP34]] +; AVX512-NEXT: [[TMP39:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD8]], [[TMP35]] +; AVX512-NEXT: [[TMP40:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD9]], [[TMP36]] +; AVX512-NEXT: [[TMP41:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD10]], [[TMP37]] ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[TMP40]], i32 0 -; AVX512-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP36]], <16 x float>* [[TMP45]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP40]], i32 16 +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[A]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr float, float* [[A]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP42]], i32 0 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP17]]) -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP40]], i32 32 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP18]]) +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP42]], i32 16 ; 
AVX512-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>*
-; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP18]])
-; AVX512-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP40]], i32 48
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP19]])
+; AVX512-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP42]], i32 32
 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <16 x float>*
-; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]])
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP40]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP20]])
+; AVX512-NEXT: [[TMP52:%.*]] = getelementptr float, float* [[TMP42]], i32 48
+; AVX512-NEXT: [[TMP53:%.*]] = bitcast float* [[TMP52]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP41]], <16 x float>* [[TMP53]], i32 4, <16 x i1> [[TMP21]])
 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
-; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; AVX512-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX512-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984
 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -966,50 +966,50 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
 ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; AVX512: vec.epilog.vector.body:
-; AVX512-NEXT: [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512-NEXT: [[TMP53:%.*]] = add i64 [[INDEX19]], 0
-; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP53]]
-; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TMP54]], i32 0
-; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <8 x i32>*
-; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32>* [[TMP56]], align 4
-; AVX512-NEXT: [[TMP57:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD20]],
-; AVX512-NEXT: [[TMP58:%.*]] = getelementptr float, float* [[B]], i64 [[TMP53]]
-; AVX512-NEXT: [[TMP59:%.*]] = getelementptr float, float* [[TMP58]], i32 0
-; AVX512-NEXT: [[TMP60:%.*]] = bitcast float* [[TMP59]] to <8 x float>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP60]], i32 4, <8 x i1> [[TMP57]], <8 x float> poison)
-; AVX512-NEXT: [[TMP61:%.*]] = sitofp <8 x i32> [[WIDE_LOAD20]] to <8 x float>
-; AVX512-NEXT: [[TMP62:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD21]], [[TMP61]]
-; AVX512-NEXT: [[TMP63:%.*]] = getelementptr float, float* [[A]], i64 [[TMP53]]
-; AVX512-NEXT: [[TMP64:%.*]] = getelementptr float, float* [[TMP63]], i32 0
-; AVX512-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <8 x float>*
-; AVX512-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP62]], <8 x float>* [[TMP65]], i32 4, <8 x i1> [[TMP57]])
-; AVX512-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX19]], 8
-; AVX512-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT22]], 10000
-; AVX512-NEXT: br i1 [[TMP66]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; AVX512-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; AVX512-NEXT: [[TMP55:%.*]] = add i64 [[OFFSET_IDX]], 0
+; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP55]]
+; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i32 0
+; AVX512-NEXT: [[TMP58:%.*]] = bitcast i32* [[TMP57]] to <8 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP58]], align 4
+; AVX512-NEXT: [[TMP59:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]],
+; AVX512-NEXT: [[TMP60:%.*]] = getelementptr float, float* [[B]], i64 [[TMP55]]
+; AVX512-NEXT: [[TMP61:%.*]] = getelementptr float, float* [[TMP60]], i32 0
+; AVX512-NEXT: [[TMP62:%.*]] = bitcast float* [[TMP61]] to <8 x float>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP62]], i32 4, <8 x i1> [[TMP59]], <8 x float> poison)
+; AVX512-NEXT: [[TMP63:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float>
+; AVX512-NEXT: [[TMP64:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD14]], [[TMP63]]
+; AVX512-NEXT: [[TMP65:%.*]] = getelementptr float, float* [[A]], i64 [[TMP55]]
+; AVX512-NEXT: [[TMP66:%.*]] = getelementptr float, float* [[TMP65]], i32 0
+; AVX512-NEXT: [[TMP67:%.*]] = bitcast float* [[TMP66]] to <8 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP64]], <8 x float>* [[TMP67]], i32 4, <8 x i1> [[TMP59]])
+; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8
+; AVX512-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
+; AVX512-NEXT: br i1 [[TMP68]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; AVX512: vec.epilog.middle.block:
-; AVX512-NEXT: [[CMP_N18:%.*]] = icmp eq i64 10000, 10000
-; AVX512-NEXT: br i1 [[CMP_N18]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; AVX512-NEXT: [[CMP_N11:%.*]] = icmp eq i64 10000, 10000
+; AVX512-NEXT: br i1 [[CMP_N11]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; AVX512: vec.epilog.scalar.ph:
 ; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; AVX512-NEXT: br label [[FOR_BODY:%.*]]
 ; AVX512: for.body:
 ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
 ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: [[TMP67:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP67]], 100
+; AVX512-NEXT: [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP69]], 100
 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX512: if.then:
 ; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: [[TMP68:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP67]] to float
-; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP68]], [[CONV]]
+; AVX512-NEXT: [[TMP70:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP69]] to float
+; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP70]], [[CONV]]
 ; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
 ; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
 ; AVX512-NEXT: br label [[FOR_INC]]
 ; AVX512: for.inc:
 ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; AVX512: for.end.loopexit:
 ; AVX512-NEXT: br label [[FOR_END]]
 ; AVX512: for.end:
@@ -1146,7 +1146,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !12, !noalias !14
 ; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
+; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; AVX: middle.block:
 ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000
 ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1170,7 +1170,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX: for.inc:
 ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; AVX: for.end:
 ; AVX-NEXT: ret void
 ;
@@ -1265,7 +1265,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18
 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984
 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1289,7 +1289,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX512: for.inc:
 ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; AVX512: for.end:
 ; AVX512-NEXT: ret void
 ;
@@ -1397,7 +1397,7 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]],
 ; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624
-; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP53:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, 624
 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1422,7 +1422,7 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX512: for.inc:
 ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
 ; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP54:![0-9]+]]
+; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP30:![0-9]+]]
 ; AVX512: for.end:
 ; AVX512-NEXT: ret void
 ;
@@ -1676,7 +1676,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
 ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !22, !noalias !24
 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX2-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]]
+; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; AVX2: middle.block:
 ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1699,7 +1699,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
 ; AVX2: for.inc:
 ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
 ; AVX2-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
-; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
+; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; AVX2: for.end:
 ; AVX2-NEXT: ret void
 ;
@@ -1819,7 +1819,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
 ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !36, !noalias !38
 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; AVX512-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP63:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1842,7 +1842,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
 ; AVX512: for.inc:
 ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
 ; AVX512-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
-; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP64:![0-9]+]]
+; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; AVX512: for.end:
 ; AVX512-NEXT: ret void
 ;
@@ -1974,7 +1974,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
+; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; AVX1: middle.block:
 ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2000,7 +2000,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX1: for.inc:
 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; AVX1: for.end.loopexit:
 ; AVX1-NEXT: br label [[FOR_END]]
 ; AVX1: for.end:
@@ -2098,7 +2098,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP51:![0-9]+]]
+; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; AVX2: middle.block:
 ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2124,7 +2124,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX2: for.inc:
 ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
+; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; AVX2: for.end.loopexit:
 ; AVX2-NEXT: br label [[FOR_END]]
 ; AVX2: for.end:
@@ -2222,7 +2222,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]])
 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP65:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2248,7 +2248,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX512: for.inc:
 ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP66:![0-9]+]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; AVX512: for.end.loopexit:
 ; AVX512-NEXT: br label [[FOR_END]]
 ; AVX512: for.end:
@@ -2391,7 +2391,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; AVX1: middle.block:
 ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2417,7 +2417,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX1: for.inc:
 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
+; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; AVX1: for.end.loopexit:
 ; AVX1-NEXT: br label [[FOR_END]]
 ; AVX1: for.end:
@@ -2515,7 +2515,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
+; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; AVX2: middle.block:
 ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2541,7 +2541,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX2: for.inc:
 ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP55:![0-9]+]]
+; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
 ; AVX2: for.end.loopexit:
 ; AVX2-NEXT: br label [[FOR_END]]
 ; AVX2: for.end:
@@ -2639,7 +2639,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]])
 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP67:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2665,7 +2665,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX512: for.inc:
 ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP68:![0-9]+]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; AVX512: for.end.loopexit:
 ; AVX512-NEXT: br label [[FOR_END]]
 ; AVX512: for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
index 57ddff6..46e84ab 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 -S | FileCheck %s
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 -S %s | FileCheck %s
 ;
 ; Integer reduction with a start value of 5
@@ -44,15 +44,15 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[TMP7]], [[VEC_EPILOG_PH]] ], [ [[TMP12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP8]]
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP9]], i32 0
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <4 x i64>*
 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, <4 x i64>* [[TMP11]], align 4
 ; CHECK-NEXT: [[TMP12]] = add <4 x i64> [[WIDE_LOAD8]], [[VEC_PHI7]]
-; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
+; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
 ; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
@@ -141,16 +141,16 @@ define float @fp_reduction_max(float* noalias %a, i64 %N) {
 ; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x float> [[MINMAX_IDENT_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP8]]
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP11]], align 4
 ; CHECK-NEXT: [[TMP12:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI7]], [[WIDE_LOAD8]]
 ; CHECK-NEXT: [[TMP13]] = select <4 x i1> [[TMP12]], <4 x float> [[VEC_PHI7]], <4 x float> [[WIDE_LOAD8]]
-; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
+; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
 ; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
@@ -222,7 +222,7 @@ define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i16>
 ; CHECK-NEXT: [[TMP9]] = zext <4 x i16> [[TMP8]] to <4 x i32>
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i32> [[TMP9]] to <4 x i16>
 ; CHECK-NEXT: [[TMP11:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP10]])
@@ -237,9 +237,9 @@ define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX3:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP13]], [[VEC_EPILOG_PH]] ], [ [[TMP23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[INDEX3]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP15:%.*]] = and <4 x i32> [[VEC_PHI4]],
 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i32 [[TMP14]]
 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP16]], i32 0
@@ -247,11 +247,11 @@ define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP18]], align 2
 ; CHECK-NEXT: [[TMP19:%.*]] = zext <4 x i16> [[WIDE_LOAD5]] to <4 x i32>
 ; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP15]], [[TMP19]]
-; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX3]], 4
+; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[OFFSET_IDX]], 4
 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT6]], 256
 ; CHECK-NEXT: [[TMP22:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i16>
 ; CHECK-NEXT: [[TMP23]] = zext <4 x i16> [[TMP22]] to <4 x i32>
-; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
 ; CHECK-NEXT: [[TMP24:%.*]] = trunc <4 x i32> [[TMP23]] to <4 x i16>
 ; CHECK-NEXT: [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]])
@@ -272,7 +272,7 @@ define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-NEXT: [[XOR]] = or i32 [[SUM_02]], [[EXT]]
 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: for.end.loopexit:
 ; CHECK-NEXT: [[XOR_LCSSA1:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_END]]
@@ -329,7 +329,7 @@ define float @multiple_fp_rdx(float* %A, i64 %N) {
 ; CHECK-NEXT: [[TMP5]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
 ; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
@@ -349,19 +349,19 @@ define float @multiple_fp_rdx(float* %A, i64 %N) {
 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> zeroinitializer, float [[BC_MERGE_RDX3]], i32 0
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x float> [ [[TMP9]], [[VEC_EPILOG_PH]] ], [ [[TMP16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI11:%.*]] = phi <4 x float> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[TMP15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX9]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <4 x float>*
 ; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP14]], align 4
 ; CHECK-NEXT: [[TMP15]] = fadd fast <4 x float> [[VEC_PHI11]], [[WIDE_LOAD12]]
 ; CHECK-NEXT: [[TMP16]] = fmul fast <4 x float> [[VEC_PHI10]], [[WIDE_LOAD12]]
-; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX9]], 4
+; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC7]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
 ; CHECK-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]])
 ; CHECK-NEXT: [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP16]])
@@ -382,7 +382,7 @@ define float @multiple_fp_rdx(float* %A, i64 %N) {
 ; CHECK-NEXT: [[MUL]] = fmul fast float [[PROD]], [[TMP20]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK: for.end.loopexit:
 ; CHECK-NEXT: [[ADD_LCSSA5:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: [[MUL_LCSSA4:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
@@ -444,7 +444,7 @@ define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
 ; CHECK-NEXT: [[TMP5]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -461,17 +461,17 @@ define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX6]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
 ; CHECK-NEXT: [[TMP13]] = sub <4 x i32> [[VEC_PHI7]], [[WIDE_LOAD8]]
-; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
+; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
 ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
 ; CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[N]], [[N_VEC4]]
@@ -488,7 +488,7 @@ define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
 ; CHECK-NEXT: [[SUB]] = sub nsw i32 [[SUM]], [[LOAD]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: for.cond.loopexit:
 ; CHECK-NEXT: [[SUB_LCSSA2:%.*]] = phi i32 [ [[SUB]], [[FOR_BODY]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_COND]]
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
index 1756a44..5d17783 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
@@ -40,7 +40,7 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) {
 ; VF-TWO-CHECK-NEXT: [[TMP7:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
 ; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF-TWO-CHECK: middle.block:
 ; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
@@ -55,24 +55,24 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) {
 ; VF-TWO-CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF4]]
 ; VF-TWO-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; VF-TWO-CHECK: vec.epilog.vector.body:
-; VF-TWO-CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX6]], 0
+; VF-TWO-CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF-TWO-CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
 ; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
 ; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4
 ; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP10]]
 ; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
 ; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = add nsw <2 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD10]]
-; VF-TWO-CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX6]], 2
-; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC5]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]]
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = add nsw <2 x i32> [[WIDE_LOAD8]], [[WIDE_LOAD9]]
+; VF-TWO-CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 2
+; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; VF-TWO-CHECK: vec.epilog.middle.block:
-; VF-TWO-CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC5]]
+; VF-TWO-CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC5]]
 ; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1
-; VF-TWO-CHECK-NEXT: br i1 [[CMP_N8]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-TWO-CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-TWO-CHECK: vec.epilog.scalar.ph:
 ; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -85,7 +85,7 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) {
 ; VF-TWO-CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
 ; VF-TWO-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; VF-TWO-CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], [[LOOP4:!llvm.loop !.*]]
+; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF-TWO-CHECK: for.end.loopexit.loopexit:
 ; VF-TWO-CHECK-NEXT: [[ADD_LCSSA3:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; VF-TWO-CHECK-NEXT: br label [[FOR_END_LOOPEXIT]]
-- 
2.7.4