From d92cec4c96eb85096cc64643764e1370fc08eafc Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Tue, 17 May 2022 17:44:54 +0100
Subject: [PATCH] [LV] Regenerate check lines for some tests.

Make sure the auto-generated check lines are up-to-date for some files,
to reduce the test diff in upcoming changes
---
 .../AArch64/sve-epilog-vect-inloop-reductions.ll |   6 +-
 .../AArch64/sve-epilog-vect-reductions.ll        |   8 +-
 .../AArch64/sve-epilog-vect-strict-reductions.ll |   8 +-
 .../PowerPC/optimal-epilog-vectorization.ll      | 600 ++++++-------
 .../LoopVectorize/X86/invariant-load-gather.ll   |  32 +-
 .../LoopVectorize/X86/limit-vf-by-tripcount.ll   |  16 +-
 .../LoopVectorize/X86/masked_load_store.ll       | 952 ++++++++++-----------
 .../epilog-vectorization-reductions.ll           |  50 +-
 .../optimal-epilog-vectorization-liveout.ll      |  24 +-
 9 files changed, 848 insertions(+), 848 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
index b8f941a..a1b0e43 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-inloop-reductions.ll
@@ -64,16 +64,16 @@ define i64 @int_reduction_and(i64* noalias nocapture %a, i64 %N) {
 ; CHECK-NEXT: [[N_VEC6:%.*]] = sub i64 [[N]], [[N_MOD_VF5]]
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP30:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX8]], 0
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP25]]
 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, i64* [[TMP26]], i32 0
 ; CHECK-NEXT: [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>*
 ; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4
 ; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[WIDE_LOAD10]])
 ; CHECK-NEXT: [[TMP30]] = and i64 [[TMP29]], [[VEC_PHI9]]
-; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 2
+; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]]
 ; CHECK-NEXT: br i1 [[TMP31]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
index e3e219c..7b2cbca 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-reductions.ll
@@ -64,15 +64,15 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <2 x i64> [ [[TMP24]], [[VEC_EPILOG_PH]] ], [ [[TMP29:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX8]], 0
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP25]]
 ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, i64* [[TMP26]], i32 0
 ; CHECK-NEXT: [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>*
 ; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4
 ; CHECK-NEXT: [[TMP29]] = add <2 x i64> [[WIDE_LOAD10]], [[VEC_PHI9]]
-; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 2
+; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]]
 ; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
@@ -91,7 +91,7 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT: [[ADD]] = add i64 [[TMP32]], [[SUM]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: for.end.loopexit:
 ; CHECK-NEXT: [[ADD_LCSSA4:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP31]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_END]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
index a32a672..61b4feb 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll
@@ -60,15 +60,15 @@ define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[N]], [[N_MOD_VF4]]
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi float [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX7]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 0
 ; CHECK-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <2 x float>*
 ; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x float>, <2 x float>* [[TMP26]], align 4
 ; CHECK-NEXT: [[TMP27]] = call float @llvm.vector.reduce.fadd.v2f32(float [[VEC_PHI8]], <2 x float> [[WIDE_LOAD9]])
-; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
+; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
 ; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
@@ -86,7 +86,7 @@ define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
 ; CHECK-NEXT: [[ADD]] = fadd float [[TMP29]], [[SUM_07]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: for.end.loopexit:
 ; CHECK-NEXT: [[ADD_LCSSA3:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_END]]
diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
index 5084720..d71bbe2 100644
--- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
+++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll
@@ -207,27 +207,27 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias
 ; VF-TWO-CHECK-NEXT: [[N_VEC26:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF25]]
 ; VF-TWO-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; VF-TWO-CHECK: vec.epilog.vector.body:
-; VF-TWO-CHECK-NEXT: [[INDEX27:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[INDEX27]], 0
+; VF-TWO-CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT31:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF-TWO-CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP133]]
 ; VF-TWO-CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds float, float* [[TMP134]], i32 0
 ; VF-TWO-CHECK-NEXT: [[TMP136:%.*]] = bitcast float* [[TMP135]] to <2 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x float>, <2 x float>* [[TMP136]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x float>, <2 x float>* [[TMP136]], align 4
 ; VF-TWO-CHECK-NEXT: [[TMP137:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP133]]
 ; VF-TWO-CHECK-NEXT: [[TMP138:%.*]] = getelementptr inbounds float, float* [[TMP137]], i32 0
 ; VF-TWO-CHECK-NEXT: [[TMP139:%.*]] = bitcast float* [[TMP138]] to <2 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x float>, <2 x float>* [[TMP139]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <2 x float> [[WIDE_LOAD30]], [[WIDE_LOAD31]]
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x float>, <2 x float>* [[TMP139]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <2 x float> [[WIDE_LOAD29]], [[WIDE_LOAD30]]
 ; VF-TWO-CHECK-NEXT: [[TMP141:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP133]]
 ; VF-TWO-CHECK-NEXT: [[TMP142:%.*]] = getelementptr inbounds float, float* [[TMP141]], i32 0
 ; VF-TWO-CHECK-NEXT: [[TMP143:%.*]] = bitcast float* [[TMP142]] to <2 x float>*
 ; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP140]], <2 x float>* [[TMP143]], align 4
-; VF-TWO-CHECK-NEXT: [[INDEX_NEXT28]] = add nuw i64 [[INDEX27]], 2
-; VF-TWO-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC26]]
+; VF-TWO-CHECK-NEXT: [[INDEX_NEXT31]] = add nuw i64 [[OFFSET_IDX]], 2
+; VF-TWO-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC26]]
 ; VF-TWO-CHECK-NEXT: br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOPID_EV:![0-9]+]]
 ; VF-TWO-CHECK: vec.epilog.middle.block:
-; VF-TWO-CHECK-NEXT: [[CMP_N29:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
-; VF-TWO-CHECK-NEXT: br i1 [[CMP_N29]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-TWO-CHECK-NEXT: [[CMP_N27:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
+; VF-TWO-CHECK-NEXT: br i1 [[CMP_N27]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-TWO-CHECK: vec.epilog.scalar.ph:
 ; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -451,27 +451,27 @@ define dso_local void @f1(float* noalias %aa, float* noalias %bb, float* noalias
 ; VF-FOUR-CHECK-NEXT: [[N_VEC26:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF25]]
 ; VF-FOUR-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; VF-FOUR-CHECK: vec.epilog.vector.body:
-; VF-FOUR-CHECK-NEXT: [[INDEX27:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-FOUR-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[INDEX27]], 0
+; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT31:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-FOUR-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF-FOUR-CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP133]]
 ; VF-FOUR-CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds float, float* [[TMP134]], i32 0
 ; VF-FOUR-CHECK-NEXT: [[TMP136:%.*]] = bitcast float* [[TMP135]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP136]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <4 x float>, <4 x float>* [[TMP136]], align 4
 ; VF-FOUR-CHECK-NEXT: [[TMP137:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP133]]
 ; VF-FOUR-CHECK-NEXT: [[TMP138:%.*]] = getelementptr inbounds float, float* [[TMP137]], i32 0
 ; VF-FOUR-CHECK-NEXT: [[TMP139:%.*]] = bitcast float* [[TMP138]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <4 x float>, <4 x float>* [[TMP139]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <4 x float> [[WIDE_LOAD30]], [[WIDE_LOAD31]]
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP139]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <4 x float> [[WIDE_LOAD29]], [[WIDE_LOAD30]]
 ; VF-FOUR-CHECK-NEXT: [[TMP141:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP133]]
 ; VF-FOUR-CHECK-NEXT: [[TMP142:%.*]] = getelementptr inbounds float, float* [[TMP141]], i32 0
 ; VF-FOUR-CHECK-NEXT: [[TMP143:%.*]] = bitcast float* [[TMP142]] to <4 x float>*
 ; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP140]], <4 x float>* [[TMP143]], align 4
-; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT28]] = add nuw i64 [[INDEX27]], 4
-; VF-FOUR-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC26]]
+; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT31]] = add nuw i64 [[OFFSET_IDX]], 4
+; VF-FOUR-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC26]]
 ; VF-FOUR-CHECK-NEXT: br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; VF-FOUR-CHECK: vec.epilog.middle.block:
-; VF-FOUR-CHECK-NEXT: [[CMP_N29:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
-; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N29]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-FOUR-CHECK-NEXT: [[CMP_N27:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
+; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N27]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-FOUR-CHECK: vec.epilog.scalar.ph:
 ; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; VF-FOUR-CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -540,12 +540,12 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-TWO-CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP2]])
 ; VF-TWO-CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
 ; VF-TWO-CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; VF-TWO-CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
-; VF-TWO-CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], [[TMP0]]
-; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
-; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
-; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP8]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; VF-TWO-CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
+; VF-TWO-CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[TMP0]]
+; VF-TWO-CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
+; VF-TWO-CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
+; VF-TWO-CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; VF-TWO-CHECK: vector.main.loop.iter.check:
 ; VF-TWO-CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
 ; VF-TWO-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -556,142 +556,142 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-TWO-CHECK: vector.body:
 ; VF-TWO-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF-TWO-CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; VF-TWO-CHECK-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 0
-; VF-TWO-CHECK-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 4
-; VF-TWO-CHECK-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 8
-; VF-TWO-CHECK-NEXT: [[TMP23:%.*]] = add i32 [[OFFSET_IDX]], 12
-; VF-TWO-CHECK-NEXT: [[TMP24:%.*]] = add i32 [[OFFSET_IDX]], 16
-; VF-TWO-CHECK-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 20
-; VF-TWO-CHECK-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 24
-; VF-TWO-CHECK-NEXT: [[TMP27:%.*]] = add i32 [[OFFSET_IDX]], 28
-; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 4
-; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 8
-; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 12
-; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
-; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 20
-; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 24
-; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 28
-; VF-TWO-CHECK-NEXT: [[TMP28:%.*]] = xor i32 [[TMP20]], -1
-; VF-TWO-CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP21]], -1
-; VF-TWO-CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP22]], -1
-; VF-TWO-CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP23]], -1
-; VF-TWO-CHECK-NEXT: [[TMP32:%.*]] = xor i32 [[TMP24]], -1
-; VF-TWO-CHECK-NEXT: [[TMP33:%.*]] = xor i32 [[TMP25]], -1
-; VF-TWO-CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP26]], -1
-; VF-TWO-CHECK-NEXT: [[TMP35:%.*]] = xor i32 [[TMP27]], -1
+; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], 0
+; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 4
+; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], 8
+; VF-TWO-CHECK-NEXT: [[TMP11:%.*]] = add i32 [[OFFSET_IDX]], 12
+; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = add i32 [[OFFSET_IDX]], 16
+; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = add i32 [[OFFSET_IDX]], 20
+; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 24
+; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 28
+; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 0
+; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 4
+; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 8
+; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 12
+; VF-TWO-CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 16
+; VF-TWO-CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 20
+; VF-TWO-CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 24
+; VF-TWO-CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 28
+; VF-TWO-CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP8]], -1
+; VF-TWO-CHECK-NEXT: [[TMP25:%.*]] = xor i32 [[TMP9]], -1
+; VF-TWO-CHECK-NEXT: [[TMP26:%.*]] = xor i32 [[TMP10]], -1
+; VF-TWO-CHECK-NEXT: [[TMP27:%.*]] = xor i32 [[TMP11]], -1
+; VF-TWO-CHECK-NEXT: [[TMP28:%.*]] = xor i32 [[TMP12]], -1
+; VF-TWO-CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP13]], -1
+; VF-TWO-CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP14]], -1
+; VF-TWO-CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP15]], -1
+; VF-TWO-CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP24]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP25]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP26]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP27]], [[N]]
 ; VF-TWO-CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP28]], [[N]]
 ; VF-TWO-CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP29]], [[N]]
 ; VF-TWO-CHECK-NEXT: [[TMP38:%.*]] = add i32 [[TMP30]], [[N]]
 ; VF-TWO-CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP31]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP40:%.*]] = add i32 [[TMP32]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP41:%.*]] = add i32 [[TMP33]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP34]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP35]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP40:%.*]] = sext i32 [[TMP32]] to i64
+; VF-TWO-CHECK-NEXT: [[TMP41:%.*]] = sext i32 [[TMP33]] to i64
+; VF-TWO-CHECK-NEXT: [[TMP42:%.*]] = sext i32 [[TMP34]] to i64
+; VF-TWO-CHECK-NEXT: [[TMP43:%.*]] = sext i32 [[TMP35]] to i64
 ; VF-TWO-CHECK-NEXT: [[TMP44:%.*]] = sext i32 [[TMP36]] to i64
 ; VF-TWO-CHECK-NEXT: [[TMP45:%.*]] = sext i32 [[TMP37]] to i64
 ; VF-TWO-CHECK-NEXT: [[TMP46:%.*]] = sext i32 [[TMP38]] to i64
 ; VF-TWO-CHECK-NEXT: [[TMP47:%.*]] = sext i32 [[TMP39]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP48:%.*]] = sext i32 [[TMP40]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP49:%.*]] = sext i32 [[TMP41]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP50:%.*]] = sext i32 [[TMP42]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP51:%.*]] = sext i32 [[TMP43]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP44]]
+; VF-TWO-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP40]]
+; VF-TWO-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP41]]
+; VF-TWO-CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP42]]
+; VF-TWO-CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP43]]
+; VF-TWO-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP44]]
 ; VF-TWO-CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP45]]
 ; VF-TWO-CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP46]]
 ; VF-TWO-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP47]]
-; VF-TWO-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP48]]
-; VF-TWO-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP49]]
-; VF-TWO-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP50]]
-; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP51]]
-; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 0
-; VF-TWO-CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, float* [[TMP60]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = bitcast float* [[TMP61]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP62]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 0
+; VF-TWO-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP58:%.*]] = bitcast float* [[TMP57]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP58]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -4
-; VF-TWO-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP63]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP65]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -4
+; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP59]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP61:%.*]] = bitcast float* [[TMP60]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP61]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -8
-; VF-TWO-CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, float* [[TMP66]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = bitcast float* [[TMP67]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP68]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -8
+; VF-TWO-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP62]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP64:%.*]] = bitcast float* [[TMP63]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP64]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -12
-; VF-TWO-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, float* [[TMP69]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP71:%.*]] = bitcast float* [[TMP70]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP71]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -12
+; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP65]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP67:%.*]] = bitcast float* [[TMP66]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP67]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -16
-; VF-TWO-CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = bitcast float* [[TMP73]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP74]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -16
+; VF-TWO-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP68]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP70:%.*]] = bitcast float* [[TMP69]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP70]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -20
-; VF-TWO-CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, float* [[TMP75]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP77:%.*]] = bitcast float* [[TMP76]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP77]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -20
+; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP71]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP73:%.*]] = bitcast float* [[TMP72]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP73]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -24
-; VF-TWO-CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[TMP78]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = bitcast float* [[TMP79]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP80]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -24
+; VF-TWO-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP74]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP76:%.*]] = bitcast float* [[TMP75]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP76]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -28
-; VF-TWO-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, float* [[TMP81]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP83:%.*]] = bitcast float* [[TMP82]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP83]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -28
+; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP77]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP79:%.*]] = bitcast float* [[TMP78]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP79]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP88:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP89:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP90:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP91:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP12]]
-; VF-TWO-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP13]]
-; VF-TWO-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; VF-TWO-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
-; VF-TWO-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]]
-; VF-TWO-CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
-; VF-TWO-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
-; VF-TWO-CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; VF-TWO-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 0
+; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP81:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP82:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP83:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP16]]
+; VF-TWO-CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
+; VF-TWO-CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
+; VF-TWO-CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
+; VF-TWO-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP20]]
+; VF-TWO-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]]
+; VF-TWO-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP22]]
+; VF-TWO-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
+; VF-TWO-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 0
+; VF-TWO-CHECK-NEXT: [[TMP97:%.*]] = bitcast float* [[TMP96]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP80]], <4 x float>* [[TMP97]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 4
+; VF-TWO-CHECK-NEXT: [[TMP99:%.*]] = bitcast float* [[TMP98]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP81]], <4 x float>* [[TMP99]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 8
 ; VF-TWO-CHECK-NEXT: [[TMP101:%.*]] = bitcast float* [[TMP100]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], <4 x float>* [[TMP101]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP82]], <4 x float>* [[TMP101]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 12
 ; VF-TWO-CHECK-NEXT: [[TMP103:%.*]] = bitcast float* [[TMP102]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], <4 x float>* [[TMP103]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 8
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP83]], <4 x float>* [[TMP103]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 16
 ; VF-TWO-CHECK-NEXT: [[TMP105:%.*]] = bitcast float* [[TMP104]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], <4 x float>* [[TMP105]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 12
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], <4 x float>* [[TMP105]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 20
 ; VF-TWO-CHECK-NEXT: [[TMP107:%.*]] = bitcast float* [[TMP106]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], <4 x float>* [[TMP107]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 16
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], <4 x float>* [[TMP107]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 24
 ; VF-TWO-CHECK-NEXT: [[TMP109:%.*]] = bitcast float* [[TMP108]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP88]], <4 x float>* [[TMP109]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 20
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], <4 x float>* [[TMP109]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 28
 ; VF-TWO-CHECK-NEXT: [[TMP111:%.*]] = bitcast float* [[TMP110]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP89]], <4 x float>* [[TMP111]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 24
-; VF-TWO-CHECK-NEXT: [[TMP113:%.*]] = bitcast float* [[TMP112]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP90]], <4 x float>* [[TMP113]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 28
-; VF-TWO-CHECK-NEXT: [[TMP115:%.*]] = bitcast float* [[TMP114]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP91]], <4 x float>* [[TMP115]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], <4 x float>* [[TMP111]], align 4
 ; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; VF-TWO-CHECK-NEXT: [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF-TWO-CHECK-NEXT: [[TMP112:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP112]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; VF-TWO-CHECK: middle.block:
 ; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
 ; VF-TWO-CHECK: vec.epilog.iter.check:
-; VF-TWO-CHECK-NEXT: [[IND_END21:%.*]] = trunc i64 [[N_VEC]] to i32
+; VF-TWO-CHECK-NEXT: [[IND_END19:%.*]] = trunc i64 [[N_VEC]] to i32
 ; VF-TWO-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
 ; VF-TWO-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
@@ -702,49 +702,49 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-TWO-CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC17]] to i32
 ; VF-TWO-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; VF-TWO-CHECK: vec.epilog.vector.body:
-; VF-TWO-CHECK-NEXT: [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[OFFSET_IDX23:%.*]] = trunc i64 [[INDEX18]] to i32
-; VF-TWO-CHECK-NEXT: [[TMP118:%.*]] = add i32 [[OFFSET_IDX23]], 0
-; VF-TWO-CHECK-NEXT: [[TMP117:%.*]] = add i64 [[INDEX18]], 0
-; VF-TWO-CHECK-NEXT: [[TMP119:%.*]] = xor i32 [[TMP118]], -1
-; VF-TWO-CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP119]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP121:%.*]] = sext i32 [[TMP120]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP121]]
-; VF-TWO-CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds float, float* [[TMP122]], i32 0
-; VF-TWO-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 -1
-; VF-TWO-CHECK-NEXT: [[TMP125:%.*]] = bitcast float* [[TMP124]] to <2 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <2 x float>, <2 x float>* [[TMP125]], align 4
+; VF-TWO-CHECK-NEXT: [[OFFSET_IDX23:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT: [[OFFSET_IDX22:%.*]] = trunc i64 [[OFFSET_IDX23]] to i32
+; VF-TWO-CHECK-NEXT: [[TMP113:%.*]] = add i32 [[OFFSET_IDX22]], 0
+; VF-TWO-CHECK-NEXT: [[TMP114:%.*]] = add i64 [[OFFSET_IDX23]], 0
+; VF-TWO-CHECK-NEXT: [[TMP115:%.*]] = xor i32 [[TMP113]], -1
+; VF-TWO-CHECK-NEXT: [[TMP116:%.*]] = add i32 [[TMP115]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP117:%.*]] = sext i32 [[TMP116]] to i64
+; VF-TWO-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP117]]
+; VF-TWO-CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds float, float* [[TMP118]], i32 0
+; VF-TWO-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, float* [[TMP119]], i32 -1
+; VF-TWO-CHECK-NEXT: [[TMP121:%.*]] = bitcast float* [[TMP120]] to <2 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <2 x float>, <2 x float>* [[TMP121]], align 4
 ; VF-TWO-CHECK-NEXT: [[REVERSE25:%.*]] = shufflevector <2 x float> [[WIDE_LOAD24]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP126:%.*]] = fadd fast <2 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP127:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP117]]
-; VF-TWO-CHECK-NEXT: [[TMP128:%.*]] = getelementptr inbounds float, float* [[TMP127]], i32 0
-; VF-TWO-CHECK-NEXT: [[TMP129:%.*]] = bitcast float* [[TMP128]] to <2 x float>*
-; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP126]], <2 x float>* [[TMP129]], align 4
-; VF-TWO-CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX18]], 2
-; VF-TWO-CHECK-NEXT: [[TMP130:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC17]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP130]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF-TWO-CHECK-NEXT: [[TMP122:%.*]] = fadd fast <2 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP114]]
+; VF-TWO-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 0
+; VF-TWO-CHECK-NEXT: [[TMP125:%.*]] = bitcast float* [[TMP124]] to <2 x float>*
+; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP122]], <2 x float>* [[TMP125]], align 4
+; VF-TWO-CHECK-NEXT: [[INDEX_NEXT26]] = add nuw i64 [[OFFSET_IDX23]], 2
+; VF-TWO-CHECK-NEXT: [[TMP126:%.*]] = icmp eq i64 [[INDEX_NEXT26]], [[N_VEC17]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP126]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; VF-TWO-CHECK: vec.epilog.middle.block:
-; VF-TWO-CHECK-NEXT: [[CMP_N22:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]]
-; VF-TWO-CHECK-NEXT: br i1 [[CMP_N22]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-TWO-CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]]
+; VF-TWO-CHECK-NEXT: br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-TWO-CHECK: vec.epilog.scalar.ph:
 ; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
-; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL20:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL18:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; VF-TWO-CHECK: for.body:
 ; VF-TWO-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL20]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[I_014]], -1
-; VF-TWO-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP131]], [[N]]
+; VF-TWO-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL18]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; VF-TWO-CHECK-NEXT: [[TMP127:%.*]] = xor i32 [[I_014]], -1
+; VF-TWO-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP127]], [[N]]
 ; VF-TWO-CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64
 ; VF-TWO-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]]
-; VF-TWO-CHECK-NEXT: [[TMP132:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; VF-TWO-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP132]], 1.000000e+00
+; VF-TWO-CHECK-NEXT: [[TMP128:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; VF-TWO-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP128]], 1.000000e+00
 ; VF-TWO-CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
 ; VF-TWO-CHECK-NEXT: store float [[CONV3]], float* [[ARRAYIDX5]], align 4
 ; VF-TWO-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; VF-TWO-CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_014]], 1
 ; VF-TWO-CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOPID_MS_CM:![0-9]+]]
 ; VF-TWO-CHECK: for.end.loopexit.loopexit:
 ; VF-TWO-CHECK-NEXT: br label [[FOR_END_LOOPEXIT]]
 ; VF-TWO-CHECK: for.end.loopexit:
@@ -767,12 +767,12 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-FOUR-CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP2]])
 ; VF-FOUR-CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
 ; VF-FOUR-CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; VF-FOUR-CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
-; VF-FOUR-CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], [[TMP0]]
-; VF-FOUR-CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
-; VF-FOUR-CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
-; VF-FOUR-CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP8]]
-; VF-FOUR-CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; VF-FOUR-CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
+; VF-FOUR-CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[TMP0]]
+; VF-FOUR-CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
+; VF-FOUR-CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
+; VF-FOUR-CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
+; VF-FOUR-CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
 ; VF-FOUR-CHECK: vector.main.loop.iter.check:
 ; VF-FOUR-CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
 ; VF-FOUR-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
@@ -783,142 +783,142 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe
 ; VF-FOUR-CHECK: vector.body:
 ; VF-FOUR-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; VF-FOUR-CHECK-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 0
-; VF-FOUR-CHECK-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 4
-; VF-FOUR-CHECK-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 8
-; VF-FOUR-CHECK-NEXT: [[TMP23:%.*]] = add i32 [[OFFSET_IDX]], 12
-; VF-FOUR-CHECK-NEXT: [[TMP24:%.*]] = add i32 [[OFFSET_IDX]], 16
-; VF-FOUR-CHECK-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 20
-; VF-FOUR-CHECK-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 24
-; VF-FOUR-CHECK-NEXT: [[TMP27:%.*]] = add i32 [[OFFSET_IDX]], 28
-; VF-FOUR-CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
-; VF-FOUR-CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 4
-; VF-FOUR-CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 8
-; VF-FOUR-CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 12
-; VF-FOUR-CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
-; VF-FOUR-CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 20
-; VF-FOUR-CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 24
-; VF-FOUR-CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 28
-; VF-FOUR-CHECK-NEXT: [[TMP28:%.*]] = xor i32 [[TMP20]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP21]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP22]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP23]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP32:%.*]] = xor i32 [[TMP24]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP33:%.*]] = xor i32 [[TMP25]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP26]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP35:%.*]] = xor i32 [[TMP27]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], 0
+; VF-FOUR-CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 4
+; VF-FOUR-CHECK-NEXT: [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], 8
+; VF-FOUR-CHECK-NEXT: [[TMP11:%.*]] = add i32 [[OFFSET_IDX]], 12
+; VF-FOUR-CHECK-NEXT: [[TMP12:%.*]] = add i32 [[OFFSET_IDX]], 16
+; VF-FOUR-CHECK-NEXT: [[TMP13:%.*]] = add i32 [[OFFSET_IDX]], 20
+; VF-FOUR-CHECK-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 24
+; VF-FOUR-CHECK-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 28
+; VF-FOUR-CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 0
+; VF-FOUR-CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 4
+; VF-FOUR-CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 8
+; VF-FOUR-CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 12
+; VF-FOUR-CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 16
+; VF-FOUR-CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 20
+; VF-FOUR-CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 24
+; VF-FOUR-CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 28
+; VF-FOUR-CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP8]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP25:%.*]] = xor i32 [[TMP9]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP26:%.*]] = xor i32 [[TMP10]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP27:%.*]] = xor i32 [[TMP11]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP28:%.*]] = xor i32 [[TMP12]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP13]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP14]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP15]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP24]], [[N]]
+; VF-FOUR-CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP25]], [[N]]
+; VF-FOUR-CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP26]], [[N]]
+; VF-FOUR-CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP27]], [[N]]
 ; VF-FOUR-CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP28]], [[N]]
 ; VF-FOUR-CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP29]], [[N]]
 ; VF-FOUR-CHECK-NEXT: [[TMP38:%.*]] = add i32 [[TMP30]], [[N]]
 ; VF-FOUR-CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP31]], [[N]]
-; VF-FOUR-CHECK-NEXT: [[TMP40:%.*]] = add i32 [[TMP32]], [[N]]
-; VF-FOUR-CHECK-NEXT: [[TMP41:%.*]] = add i32 [[TMP33]], [[N]]
-; VF-FOUR-CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP34]], [[N]]
-; VF-FOUR-CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP35]], [[N]]
+; VF-FOUR-CHECK-NEXT: [[TMP40:%.*]] = sext i32 [[TMP32]] to i64
+; VF-FOUR-CHECK-NEXT: [[TMP41:%.*]] = sext i32 [[TMP33]] to i64
+; VF-FOUR-CHECK-NEXT: [[TMP42:%.*]] = sext i32 [[TMP34]] to i64
+; VF-FOUR-CHECK-NEXT: [[TMP43:%.*]] = sext i32 [[TMP35]] to i64
 ; VF-FOUR-CHECK-NEXT: [[TMP44:%.*]] = sext i32 [[TMP36]] to i64
 ; VF-FOUR-CHECK-NEXT: [[TMP45:%.*]] = sext i32 [[TMP37]] to i64
 ; VF-FOUR-CHECK-NEXT: [[TMP46:%.*]] = sext i32 [[TMP38]] to i64
 ; VF-FOUR-CHECK-NEXT: [[TMP47:%.*]] = sext i32 [[TMP39]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP48:%.*]] = sext i32 [[TMP40]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP49:%.*]] = sext i32 [[TMP41]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP50:%.*]] = sext i32 [[TMP42]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP51:%.*]] = sext i32 [[TMP43]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP44]]
+; VF-FOUR-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP40]]
+; VF-FOUR-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP41]]
+; VF-FOUR-CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP42]]
+; VF-FOUR-CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP43]]
+; VF-FOUR-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP44]]
 ; VF-FOUR-CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP45]]
 ; VF-FOUR-CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP46]]
 ; VF-FOUR-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP47]]
-; VF-FOUR-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP48]]
-; VF-FOUR-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP49]]
-; VF-FOUR-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP50]]
-; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP51]]
-; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 0
-; VF-FOUR-CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, float* [[TMP60]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = bitcast float* [[TMP61]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP62]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 0
+; VF-FOUR-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP58:%.*]] = bitcast float* [[TMP57]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP58]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -4
-; VF-FOUR-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP63]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP65]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -4
+; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP59]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP61:%.*]] = bitcast float* [[TMP60]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP61]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -8
-; VF-FOUR-CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, float* [[TMP66]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = bitcast float* [[TMP67]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP68]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -8
+; VF-FOUR-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP62]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP64:%.*]] = bitcast float* [[TMP63]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP64]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -12
-; VF-FOUR-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, float* [[TMP69]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP71:%.*]] = bitcast float* [[TMP70]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP71]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -12
+; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP65]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP67:%.*]] = bitcast float* [[TMP66]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP67]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -16
-; VF-FOUR-CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = bitcast float* [[TMP73]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP74]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -16
+; VF-FOUR-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP68]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP70:%.*]] = bitcast float* [[TMP69]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP70]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -20
-; VF-FOUR-CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, float* [[TMP75]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP77:%.*]] = bitcast float* [[TMP76]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP77]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -20
+; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP71]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP73:%.*]] = bitcast float* [[TMP72]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP73]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -24
-; VF-FOUR-CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[TMP78]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = bitcast float* [[TMP79]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP80]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -24
+; VF-FOUR-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP74]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP76:%.*]] = bitcast float* [[TMP75]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP76]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -28
-; VF-FOUR-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, float* [[TMP81]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP83:%.*]] = bitcast float* [[TMP82]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP83]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -28
+; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP77]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP79:%.*]] = bitcast float* [[TMP78]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP79]], align 4
 ; VF-FOUR-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP88:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP89:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP90:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP91:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP12]]
-; VF-FOUR-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP13]]
-; VF-FOUR-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; VF-FOUR-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
-; VF-FOUR-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]]
-; VF-FOUR-CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
-; VF-FOUR-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
-; VF-FOUR-CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; VF-FOUR-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 0
+; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP81:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP82:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP83:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP16]]
+; VF-FOUR-CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
+; VF-FOUR-CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
+; VF-FOUR-CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
+; VF-FOUR-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP20]]
+; VF-FOUR-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]]
+; VF-FOUR-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP22]]
+; VF-FOUR-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
+; VF-FOUR-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 0
+; VF-FOUR-CHECK-NEXT: [[TMP97:%.*]] = bitcast float* [[TMP96]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP80]], <4 x float>* [[TMP97]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 4
+; VF-FOUR-CHECK-NEXT: [[TMP99:%.*]] = bitcast float* [[TMP98]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP81]], <4 x float>* [[TMP99]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 8
 ; VF-FOUR-CHECK-NEXT: [[TMP101:%.*]] = bitcast float* [[TMP100]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], <4 x float>* [[TMP101]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP82]], <4 x float>* [[TMP101]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 12
 ; VF-FOUR-CHECK-NEXT: [[TMP103:%.*]] = bitcast float* [[TMP102]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], <4 x float>* [[TMP103]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 8
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP83]], <4 x float>* [[TMP103]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 16
 ; VF-FOUR-CHECK-NEXT: [[TMP105:%.*]] = bitcast float* [[TMP104]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], <4 x float>* [[TMP105]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 12
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], <4 x float>* [[TMP105]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 20
 ; VF-FOUR-CHECK-NEXT: [[TMP107:%.*]] = bitcast float* [[TMP106]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP87]], <4 x float>* [[TMP107]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 16
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], <4 x float>* [[TMP107]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 24
 ; VF-FOUR-CHECK-NEXT: [[TMP109:%.*]] = bitcast float* [[TMP108]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP88]], <4 x float>* [[TMP109]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 20
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], <4 x float>* [[TMP109]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 28
 ; VF-FOUR-CHECK-NEXT: [[TMP111:%.*]] = bitcast float* [[TMP110]] to <4 x float>*
 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP89]], <4 x float>* [[TMP111]], align 4
 -; VF-FOUR-CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 24
-; VF-FOUR-CHECK-NEXT: [[TMP113:%.*]] = bitcast float* [[TMP112]] to <4 x float>* -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP90]], <4 x float>* [[TMP113]], align 4 -; VF-FOUR-CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 28 -; VF-FOUR-CHECK-NEXT: [[TMP115:%.*]] = bitcast float* [[TMP114]] to <4 x float>* -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP91]], <4 x float>* [[TMP115]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP87]], <4 x float>* [[TMP111]], align 4 ; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; VF-FOUR-CHECK-NEXT: [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VF-FOUR-CHECK-NEXT: br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOPID_MV_CM:![0-9]+]] +; VF-FOUR-CHECK-NEXT: [[TMP112:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; VF-FOUR-CHECK-NEXT: br i1 [[TMP112]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOPID_MV_CM:![0-9]+]] ; VF-FOUR-CHECK: middle.block: ; VF-FOUR-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; VF-FOUR-CHECK: vec.epilog.iter.check: -; VF-FOUR-CHECK-NEXT: [[IND_END21:%.*]] = trunc i64 [[N_VEC]] to i32 +; VF-FOUR-CHECK-NEXT: [[IND_END19:%.*]] = trunc i64 [[N_VEC]] to i32 ; VF-FOUR-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; VF-FOUR-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4 ; VF-FOUR-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] @@ -929,49 +929,49 @@ define dso_local signext i32 @f2(float* noalias %A, float* noalias %B, i32 signe ; VF-FOUR-CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC17]] to i32 ; VF-FOUR-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; VF-FOUR-CHECK: vec.epilog.vector.body: -; VF-FOUR-CHECK-NEXT: [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX23:%.*]] = trunc i64 [[INDEX18]] to i32 -; VF-FOUR-CHECK-NEXT: [[TMP118:%.*]] = add i32 [[OFFSET_IDX23]], 0 -; VF-FOUR-CHECK-NEXT: [[TMP117:%.*]] = add i64 [[INDEX18]], 0 -; VF-FOUR-CHECK-NEXT: [[TMP119:%.*]] = xor i32 [[TMP118]], -1 -; VF-FOUR-CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP119]], [[N]] -; VF-FOUR-CHECK-NEXT: [[TMP121:%.*]] = sext i32 [[TMP120]] to i64 -; VF-FOUR-CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP121]] -; VF-FOUR-CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds float, float* [[TMP122]], i32 0 -; VF-FOUR-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[TMP125:%.*]] = bitcast float* [[TMP124]] to <4 x float>* -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, <4 x float>* [[TMP125]], align 4 +; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX23:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX22:%.*]] = trunc i64 [[OFFSET_IDX23]] to i32 +; VF-FOUR-CHECK-NEXT: [[TMP113:%.*]] = add i32 [[OFFSET_IDX22]], 0 +; VF-FOUR-CHECK-NEXT: [[TMP114:%.*]] = add i64 [[OFFSET_IDX23]], 0 +; VF-FOUR-CHECK-NEXT: [[TMP115:%.*]] = xor i32 [[TMP113]], -1 +; VF-FOUR-CHECK-NEXT: [[TMP116:%.*]] = add i32 [[TMP115]], [[N]] +; VF-FOUR-CHECK-NEXT: [[TMP117:%.*]] = sext i32 [[TMP116]] to i64 +; 
VF-FOUR-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP117]] +; VF-FOUR-CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds float, float* [[TMP118]], i32 0 +; VF-FOUR-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, float* [[TMP119]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[TMP121:%.*]] = bitcast float* [[TMP120]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, <4 x float>* [[TMP121]], align 4 ; VF-FOUR-CHECK-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x float> [[WIDE_LOAD24]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> -; VF-FOUR-CHECK-NEXT: [[TMP126:%.*]] = fadd fast <4 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> -; VF-FOUR-CHECK-NEXT: [[TMP127:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP117]] -; VF-FOUR-CHECK-NEXT: [[TMP128:%.*]] = getelementptr inbounds float, float* [[TMP127]], i32 0 -; VF-FOUR-CHECK-NEXT: [[TMP129:%.*]] = bitcast float* [[TMP128]] to <4 x float>* -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP126]], <4 x float>* [[TMP129]], align 4 -; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX18]], 4 -; VF-FOUR-CHECK-NEXT: [[TMP130:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC17]] -; VF-FOUR-CHECK-NEXT: br i1 [[TMP130]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOPID_EV_CM:![0-9]+]] +; VF-FOUR-CHECK-NEXT: [[TMP122:%.*]] = fadd fast <4 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00> +; VF-FOUR-CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP114]] +; VF-FOUR-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 0 +; VF-FOUR-CHECK-NEXT: [[TMP125:%.*]] = bitcast float* [[TMP124]] to <4 x float>* +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP122]], <4 x float>* [[TMP125]], align 4 +; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT26]] = add nuw i64 [[OFFSET_IDX23]], 4 +; VF-FOUR-CHECK-NEXT: [[TMP126:%.*]] = icmp eq i64 [[INDEX_NEXT26]], [[N_VEC17]] +; VF-FOUR-CHECK-NEXT: br i1 [[TMP126]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOPID_EV_CM:![0-9]+]] ; VF-FOUR-CHECK: vec.epilog.middle.block: -; VF-FOUR-CHECK-NEXT: [[CMP_N22:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]] -; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N22]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; VF-FOUR-CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]] +; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; VF-FOUR-CHECK: vec.epilog.scalar.ph: ; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] -; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL20:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] +; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL18:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ] ; VF-FOUR-CHECK-NEXT: br label [[FOR_BODY:%.*]] ; VF-FOUR-CHECK: for.body: ; VF-FOUR-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; VF-FOUR-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL20]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; VF-FOUR-CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[I_014]], -1 -;
VF-FOUR-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP131]], [[N]] +; VF-FOUR-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL18]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; VF-FOUR-CHECK-NEXT: [[TMP127:%.*]] = xor i32 [[I_014]], -1 +; VF-FOUR-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP127]], [[N]] ; VF-FOUR-CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64 ; VF-FOUR-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]] -; VF-FOUR-CHECK-NEXT: [[TMP132:%.*]] = load float, float* [[ARRAYIDX]], align 4 -; VF-FOUR-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP132]], 1.000000e+00 +; VF-FOUR-CHECK-NEXT: [[TMP128:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; VF-FOUR-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP128]], 1.000000e+00 ; VF-FOUR-CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] ; VF-FOUR-CHECK-NEXT: store float [[CONV3]], float* [[ARRAYIDX5]], align 4 ; VF-FOUR-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; VF-FOUR-CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_014]], 1 ; VF-FOUR-CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; VF-FOUR-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOPID_MS_CM:![0-9]+]] +; VF-FOUR-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]] ; VF-FOUR-CHECK: for.end.loopexit.loopexit: ; VF-FOUR-CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] ; VF-FOUR-CHECK: for.end.loopexit: diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll index e33be35..313c7a8 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-load-gather.ll @@ -50,29 +50,29 @@ define i32 @inv_load_conditional(i32* %a, i64 %n, i32* %b, i32 %k) { ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC13:%.*]] = and i64 [[SMAX6]], 9223372036854775800 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT17]], <8 x i32*> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT19]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[N_VEC12:%.*]] = and i64 [[SMAX6]], 9223372036854775800 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT16]], <8 x i32*> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT18]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ 
[[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>* -; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT20]], <8 x i32>* [[TMP6]], align 4 -; CHECK-NEXT: [[INDEX_NEXT23]] = add nuw i64 [[OFFSET_IDX]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC13]] +; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT19]], <8 x i32>* [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC12]] ; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT18]], zeroinitializer -; CHECK-NEXT: [[WIDE_MASKED_GATHER21:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT18]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef) -; CHECK-NEXT: [[PREDPHI22:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER21]], <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> -; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC13]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI22]], i64 7 -; CHECK-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT17]], zeroinitializer +; CHECK-NEXT: [[WIDE_MASKED_GATHER20:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT17]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef) +; CHECK-NEXT: [[PREDPHI21:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER20]], <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> +; CHECK-NEXT: [[CMP_N13:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC12]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI21]], i64 7 +; CHECK-NEXT: br i1 [[CMP_N13]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] diff --git a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll index 2cfa1de..19e10b3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/limit-vf-by-tripcount.ll @@ -36,22 +36,22 @@ define void @test1(i8 * noalias %src, i8 * noalias %dst) #0 { ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]],
[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0 +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>* -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 64 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 64 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <8 x i8>* -; CHECK-NEXT: store <8 x i8> [[WIDE_LOAD4]], <8 x i8>* [[TMP14]], align 64 -; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 16 +; CHECK-NEXT: store <8 x i8> [[WIDE_LOAD3]], <8 x i8>* [[TMP14]], align 64 +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[OFFSET_IDX]], 8 +; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 16 ; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i64 17, 16 -; CHECK-NEXT: br i1 [[CMP_N3]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 17, 16 +; CHECK-NEXT: br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; CHECK-NEXT: br label [[LOOP_MEMCPY_EXPANSION:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index e48fb15..4014f9e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -35,24 +35,24 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4 -; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0 -; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) -; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0 -;
AVX1-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP4]]) +; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4 +; AVX1-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0 +; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x i32> poison) +; AVX1-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i32 0 +; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP10]], <8 x i32>* [[TMP13]], i32 4, <8 x i1> [[TMP6]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; AVX1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; AVX1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -62,20 +62,20 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP13]], 100 +; AVX1-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP15]], 100 ; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX1: if.then: ; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] +; AVX1-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]] ; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] ; AVX1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4 ; AVX1-NEXT: br label [[FOR_INC]] ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; AVX1: for.end: ; AVX1-NEXT: ret
void ; @@ -96,69 +96,69 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 -; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 8 +; AVX2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 16 +; AVX2-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 24 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4 -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 8 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 +; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 -; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP1]] +; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 24 +; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32>* [[TMP17]], align 4 +; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP20:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]]
; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[TMP20]], i32 0 -; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison) -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP20]], i32 8 +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP22]], i32 0 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison) -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP20]], i32 16 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison) +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP22]], i32 8 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison) -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP20]], i32 24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison) +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP22]], i32 16 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison) -; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] -; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] -; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP1]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x i32> poison) +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP22]], i32 24 +; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x i32> poison) +; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]] +; AVX2-NEXT: [[TMP36:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]] +; AVX2-NEXT: [[TMP37:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]] ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[TMP36]], i32 0 -; AVX2-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <8 x i32>* -; AVX2-NEXT: call void 
@llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP32]], <8 x i32>* [[TMP41]], i32 4, <8 x i1> [[TMP16]]) -; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP36]], i32 8 +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP41:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP38]], i32 0 ; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP33]], <8 x i32>* [[TMP43]], i32 4, <8 x i1> [[TMP17]]) -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP36]], i32 16 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP34]], <8 x i32>* [[TMP43]], i32 4, <8 x i1> [[TMP18]]) +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP38]], i32 8 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP34]], <8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP18]]) -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP36]], i32 24 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP19]]) +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP38]], i32 16 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP35]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP19]]) +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP36]], <8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP20]]) +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[TMP38]], i32 24 +; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP37]], <8 x i32>* [[TMP49]], i32 4, <8 x i1> [[TMP21]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; AVX2-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX2-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -168,20 +168,20 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP49]], 100 +; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP51]], 100 ; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX2: if.then: ; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP50:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP50]], [[TMP49]] +; AVX2-NEXT: [[TMP52:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP52]], [[TMP51]] ; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* 
[[A]], i64 [[INDVARS_IV]] ; AVX2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -204,69 +204,69 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 -; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 48 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 16 +; AVX512-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 32 +; AVX512-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 48 ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4 -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 32 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 48 +; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4 +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 32 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4 -; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP1]] +; AVX512-NEXT:
[[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 48 +; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32>* [[TMP17]], align 4 +; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[TMP20]], i32 0 -; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison) -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP20]], i32 16 +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP22]], i32 0 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison) -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP20]], i32 32 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison) +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP22]], i32 16 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP20]], i32 48 +; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP22]], i32 32 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison) -; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] -; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] -; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP1]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x i32> poison) +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, i32* [[TMP22]], i32 48 +; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32>
@llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x i32> poison) +; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]] +; AVX512-NEXT: [[TMP36:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]] +; AVX512-NEXT: [[TMP37:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]] ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[TMP36]], i32 0 -; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP32]], <16 x i32>* [[TMP41]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP36]], i32 16 +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[TMP38]], i32 0 ; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP33]], <16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP17]]) -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP36]], i32 32 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP34]], <16 x i32>* [[TMP43]], i32 4, <16 x i1> [[TMP18]]) +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[TMP38]], i32 16 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP34]], <16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP18]]) -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP36]], i32 48 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP19]]) +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP38]], i32 32 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP35]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP19]]) +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP36]], <16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP20]]) +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[TMP38]], i32 48 +; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP37]], <16 x i32>* [[TMP49]], i32 4, <16 x i1> [[TMP21]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 -; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; AVX512-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX512-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -276,48 +276,48 @@ define void @foo1(i32* nocapture %A, i32* nocapture readonly %B, i32* nocapture ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, 
[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; AVX512: vec.epilog.vector.body: -; AVX512-NEXT: [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP49:%.*]] = add i64 [[INDEX19]], 0 -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[TMP50]], i32 0 -; AVX512-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32>* [[TMP52]], align 4 -; AVX512-NEXT: [[TMP53:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD20]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr i32, i32* [[TMP54]], i32 0 -; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <8 x i32>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP56]], i32 4, <8 x i1> [[TMP53]], <8 x i32> poison) -; AVX512-NEXT: [[TMP57:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD21]], [[WIDE_LOAD20]] -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr i32, i32* [[TMP58]], i32 0 -; AVX512-NEXT: [[TMP60:%.*]] = bitcast i32* [[TMP59]] to <8 x i32>* -; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP57]], <8 x i32>* [[TMP60]], i32 4, <8 x i1> [[TMP53]]) -; AVX512-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX19]], 8 -; AVX512-NEXT: [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT22]], 10000 -; AVX512-NEXT: br i1 [[TMP61]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; AVX512-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 0 +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP51]] +; AVX512-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[TMP52]], i32 0 +; AVX512-NEXT: [[TMP54:%.*]] = bitcast i32* [[TMP53]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP54]], align 4 +; AVX512-NEXT: [[TMP55:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX512-NEXT: [[TMP56:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP51]] +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr i32, i32* [[TMP56]], i32 0 +; AVX512-NEXT: [[TMP58:%.*]] = bitcast i32* [[TMP57]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP58]], i32 4, <8 x i1> [[TMP55]], <8 x i32> poison) +; AVX512-NEXT: [[TMP59:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]] +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP51]] +; AVX512-NEXT: [[TMP61:%.*]] = getelementptr i32, i32* [[TMP60]], i32 0 +; AVX512-NEXT: [[TMP62:%.*]] = bitcast i32* [[TMP61]] to <8 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP59]], <8 x i32>* [[TMP62]], i32 4, <8 x i1> [[TMP55]]) +; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8 +; AVX512-NEXT: [[TMP63:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000 +; AVX512-NEXT: br i1 [[TMP63]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; AVX512: vec.epilog.middle.block: -; AVX512-NEXT: [[CMP_N18:%.*]] =
icmp eq i64 10000, 10000 -; AVX512-NEXT: br i1 [[CMP_N18]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX512-NEXT: [[CMP_N11:%.*]] = icmp eq i64 10000, 10000 +; AVX512-NEXT: br i1 [[CMP_N11]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX512: vec.epilog.scalar.ph: ; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP62:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP62]], 100 +; AVX512-NEXT: [[TMP64:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP64]], 100 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP63:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 -; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP63]], [[TMP62]] +; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP65]], [[TMP64]] ; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] ; AVX512-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: @@ -370,24 +370,24 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP1]], i32 0 -; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4 -; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP5]], i32 0 -; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison) -; AVX1-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr
i32, i32 addrspace(1)* [[TMP9]], i32 0 -; AVX1-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)* -; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP8]], <8 x i32> addrspace(1)* [[TMP11]], i32 4, <8 x i1> [[TMP4]]) +; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP5]], align 4 +; AVX1-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX1-NEXT: [[TMP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP7]], i32 0 +; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x i32> poison) +; AVX1-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP12:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP11]], i32 0 +; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP10]], <8 x i32> addrspace(1)* [[TMP13]], i32 4, <8 x i1> [[TMP6]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; AVX1-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; AVX1-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -397,20 +397,20 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP13:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 -; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP13]], 100 +; AVX1-NEXT: [[TMP15:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP15]], 100 ; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX1: if.then: ; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] +; AVX1-NEXT: [[TMP16:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 +; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]] ; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]] ; AVX1-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4 ; AVX1-NEXT: br label [[FOR_INC]] ; AVX1:
for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -431,69 +431,69 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 -; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 8 +; AVX2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 16 +; AVX2-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 24 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4 -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 8 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 0 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 16 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP11]], align 4 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 8 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 24 +; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP13]], align 4 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 16 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4 -; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr
i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP1]] +; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP15]], align 4 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 24 +; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP17]], align 4 +; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP20:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> +; AVX2-NEXT: [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100> ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 0 -; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x i32> poison) -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 8 +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 0 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x i32> poison) -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 16 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison) +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 8 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x i32> poison) -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison) +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 16 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison) -; AVX2-NEXT: [[TMP32:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX2-NEXT: [[TMP33:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] -; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] -; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] -; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP37:%.*]] =
getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP1]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x i32> poison) +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 24 +; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x i32> poison) +; AVX2-NEXT: [[TMP34:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX2-NEXT: [[TMP35:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]] +; AVX2-NEXT: [[TMP36:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]] +; AVX2-NEXT: [[TMP37:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]] ; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 0 -; AVX2-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP32]], <8 x i32> addrspace(1)* [[TMP41]], i32 4, <8 x i1> [[TMP16]]) -; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 8 +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP41:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 0 ; AVX2-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP33]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP17]]) -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 16 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP43]], i32 4, <8 x i1> [[TMP18]]) +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 8 ; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP34]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP18]]) -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 24 +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP19]]) +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 16 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)* -; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP35]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP19]]) +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP36]], <8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP20]]) +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 24 +; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP37]], <8 x i32> addrspace(1)* [[TMP49]], i32 4, <8 x i1> [[TMP21]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add 
nuw i64 [[INDEX]], 32 -; AVX2-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; AVX2-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX2-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -503,20 +503,20 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX2: for.body: ; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP49:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP49]], 100 +; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP51]], 100 ; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX2: if.then: ; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP50:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 -; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP50]], [[TMP49]] +; AVX2-NEXT: [[TMP52:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP52]], [[TMP51]] ; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]] ; AVX2-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -539,69 +539,69 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 -; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 48 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 16 +; AVX512-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 32 +; AVX512-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 48 ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: 
[[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP9]], align 4 -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 16 +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 0 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32 addrspace(1)* [[TMP10]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 32 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP11]], align 4 +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 16 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP4]], i32 48 +; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP13]], align 4 +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 32 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4 -; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], -; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], -; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP1]] +; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP15]], align 4 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP6]], i32 48 +; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP17]], align 4 +; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], +; AVX512-NEXT: [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], +; AVX512-NEXT: [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 0 -; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x i32> poison) -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 16 +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP5]] +; 
AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 0 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x i32> poison) -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 32 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison) +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 16 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x i32> poison) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP20]], i32 48 +; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 32 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x i32> poison) -; AVX512-NEXT: [[TMP32:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] -; AVX512-NEXT: [[TMP33:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD15]], [[WIDE_LOAD12]] -; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD16]], [[WIDE_LOAD13]] -; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD17]], [[WIDE_LOAD14]] -; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP37:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP1]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x i32> poison) +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP22]], i32 48 +; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x i32> poison) +; AVX512-NEXT: [[TMP34:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX512-NEXT: [[TMP35:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_LOAD5]] +; AVX512-NEXT: [[TMP36:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_LOAD6]] +; AVX512-NEXT: [[TMP37:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_LOAD7]] ; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP39:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 0 -; AVX512-NEXT: [[TMP41:%.*]] = bitcast i32 addrspace(1)* [[TMP40]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP32]], <16 x i32> addrspace(1)* [[TMP41]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: 
[[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 16 +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 0 ; AVX512-NEXT: [[TMP43:%.*]] = bitcast i32 addrspace(1)* [[TMP42]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP33]], <16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP17]]) -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 32 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* [[TMP43]], i32 4, <16 x i1> [[TMP18]]) +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 16 ; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP34]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP18]]) -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP36]], i32 48 +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP19]]) +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 32 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP35]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP19]]) +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP36]], <16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP20]]) +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP38]], i32 48 +; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP37]], <16 x i32> addrspace(1)* [[TMP49]], i32 4, <16 x i1> [[TMP21]]) ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 -; AVX512-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX512-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; AVX512-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX512-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; AVX512: middle.block: ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] @@ -611,48 +611,48 @@ define void @foo1_addrspace1(i32 addrspace(1)* nocapture %A, i32 addrspace(1)* n ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; AVX512: vec.epilog.vector.body: -; AVX512-NEXT: [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP49:%.*]] = add i64 [[INDEX19]], 0 -; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP50]], i32 0 -; AVX512-NEXT: [[TMP52:%.*]] = bitcast i32 addrspace(1)* 
[[TMP51]] to <8 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP52]], align 4 -; AVX512-NEXT: [[TMP53:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD20]], -; AVX512-NEXT: [[TMP54:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP55:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP54]], i32 0 -; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32 addrspace(1)* [[TMP55]] to <8 x i32> addrspace(1)* -; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP56]], i32 4, <8 x i1> [[TMP53]], <8 x i32> poison) -; AVX512-NEXT: [[TMP57:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD21]], [[WIDE_LOAD20]] -; AVX512-NEXT: [[TMP58:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP49]] -; AVX512-NEXT: [[TMP59:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP58]], i32 0 -; AVX512-NEXT: [[TMP60:%.*]] = bitcast i32 addrspace(1)* [[TMP59]] to <8 x i32> addrspace(1)* -; AVX512-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP57]], <8 x i32> addrspace(1)* [[TMP60]], i32 4, <8 x i1> [[TMP53]]) -; AVX512-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX19]], 8 -; AVX512-NEXT: [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT22]], 10000 -; AVX512-NEXT: br i1 [[TMP61]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; AVX512-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP51:%.*]] = add i64 [[OFFSET_IDX]], 0 +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP51]] +; AVX512-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP52]], i32 0 +; AVX512-NEXT: [[TMP54:%.*]] = bitcast i32 addrspace(1)* [[TMP53]] to <8 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP54]], align 4 +; AVX512-NEXT: [[TMP55:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], +; AVX512-NEXT: [[TMP56:%.*]] = getelementptr i32, i32 addrspace(1)* [[B]], i64 [[TMP51]] +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i32 0 +; AVX512-NEXT: [[TMP58:%.*]] = bitcast i32 addrspace(1)* [[TMP57]] to <8 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP58]], i32 4, <8 x i1> [[TMP55]], <8 x i32> poison) +; AVX512-NEXT: [[TMP59:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD14]], [[WIDE_LOAD13]] +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr i32, i32 addrspace(1)* [[A]], i64 [[TMP51]] +; AVX512-NEXT: [[TMP61:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP60]], i32 0 +; AVX512-NEXT: [[TMP62:%.*]] = bitcast i32 addrspace(1)* [[TMP61]] to <8 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP59]], <8 x i32> addrspace(1)* [[TMP62]], i32 4, <8 x i1> [[TMP55]]) +; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8 +; AVX512-NEXT: [[TMP63:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000 +; AVX512-NEXT: br i1 [[TMP63]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AVX512: vec.epilog.middle.block: -; AVX512-NEXT: [[CMP_N18:%.*]] = icmp eq i64 10000, 10000 -; AVX512-NEXT: br i1 [[CMP_N18]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] +; AVX512-NEXT: [[CMP_N11:%.*]] = icmp eq i64 10000, 10000 +; 
AVX512-NEXT: br i1 [[CMP_N11]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]] ; AVX512: vec.epilog.scalar.ph: ; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; AVX512-NEXT: br label [[FOR_BODY:%.*]] ; AVX512: for.body: ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP62:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 -; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP62]], 100 +; AVX512-NEXT: [[TMP64:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP64]], 100 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX512: if.then: ; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]] -; AVX512-NEXT: [[TMP63:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 -; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP63]], [[TMP62]] +; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP65]], [[TMP64]] ; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]] ; AVX512-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4 ; AVX512-NEXT: br label [[FOR_INC]] ; AVX512: for.inc: ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; AVX512: for.end.loopexit: ; AVX512-NEXT: br label [[FOR_END]] ; AVX512: for.end: @@ -714,25 +714,25 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX1-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX1: vector.body: ; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 -; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* -; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4 -; AVX1-NEXT: [[TMP4:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX1-NEXT: [[TMP5:%.*]] = getelementptr float, float* [[B]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[TMP5]], i32 0 -; AVX1-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>* -; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP7]], i32 4, <8 x i1> [[TMP4]], <8 x float> poison) -; AVX1-NEXT: [[TMP8:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> -; AVX1-NEXT: [[TMP9:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP8]] -; AVX1-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] -; AVX1-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0 -; AVX1-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <8 x float>* -; AVX1-NEXT: call void 
@llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP12]], i32 4, <8 x i1> [[TMP4]]) +; AVX1-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0 +; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4 +; AVX1-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP7:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[TMP7]], i32 0 +; AVX1-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP6]], <8 x float> poison) +; AVX1-NEXT: [[TMP10:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> +; AVX1-NEXT: [[TMP11:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP10]] +; AVX1-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]] +; AVX1-NEXT: [[TMP13:%.*]] = getelementptr float, float* [[TMP12]], i32 0 +; AVX1-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <8 x float>* +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP11]], <8 x float>* [[TMP14]], i32 4, <8 x i1> [[TMP6]]) ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; AVX1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 -; AVX1-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; AVX1-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 +; AVX1-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; AVX1: middle.block: ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -742,21 +742,21 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX1: for.body: ; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP14]], 100 +; AVX1-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP16]], 100 ; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX1: if.then: ; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; AVX1-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDX3]], align 4 -; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP14]] to float -; AVX1-NEXT: [[ADD:%.*]] = fadd float [[TMP15]], [[CONV]] +; AVX1-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP16]] to float +; AVX1-NEXT: [[ADD:%.*]] = fadd float [[TMP17]], [[CONV]] ; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] ; AVX1-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4 ; AVX1-NEXT: br label [[FOR_INC]] ; AVX1: for.inc: ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], 
!llvm.loop [[LOOP30:![0-9]+]] +; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AVX1: for.end: ; AVX1-NEXT: ret void ; @@ -777,73 +777,73 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX2-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX2: vector.body: ; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8 -; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16 -; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24 -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] +; AVX2-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX2-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 8 +; AVX2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 16 +; AVX2-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 24 ; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4 -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 ; AVX2-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4 -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 8 ; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24 +; AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16 ; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* -; AVX2-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 -; AVX2-NEXT: [[TMP16:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], -; AVX2-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD12]], -; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]], -; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD14]], -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[B]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr float, float* [[B]], i64 [[TMP1]] +; AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 24 +; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32>* [[TMP17]], align 4 +; AVX2-NEXT: [[TMP18:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX2-NEXT: [[TMP19:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD5]], +; AVX2-NEXT: 
[[TMP20:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD6]], +; AVX2-NEXT: [[TMP21:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD7]], ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP20]], i32 0 -; AVX2-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP25]], i32 4, <8 x i1> [[TMP16]], <8 x float> poison) -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP20]], i32 8 +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[B]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr float, float* [[B]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP22]], i32 0 ; AVX2-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP17]], <8 x float> poison) -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP20]], i32 16 +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP27]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison) +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP22]], i32 8 ; AVX2-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP18]], <8 x float> poison) -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP20]], i32 24 +; AVX2-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP29]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison) +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP22]], i32 16 ; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* -; AVX2-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP19]], <8 x float> poison) -; AVX2-NEXT: [[TMP32:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> -; AVX2-NEXT: [[TMP33:%.*]] = sitofp <8 x i32> [[WIDE_LOAD12]] to <8 x float> -; AVX2-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float> -; AVX2-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD14]] to <8 x float> -; AVX2-NEXT: [[TMP36:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP32]] -; AVX2-NEXT: [[TMP37:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD15]], [[TMP33]] -; AVX2-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD16]], [[TMP34]] -; AVX2-NEXT: [[TMP39:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD17]], [[TMP35]] -; AVX2-NEXT: [[TMP40:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] -; AVX2-NEXT: [[TMP41:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]] +; AVX2-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP20]], <8 x float> poison) +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP22]], i32 24 +; AVX2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP21]], <8 x float> poison) +; AVX2-NEXT: [[TMP34:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> +; 
AVX2-NEXT: [[TMP35:%.*]] = sitofp <8 x i32> [[WIDE_LOAD5]] to <8 x float> +; AVX2-NEXT: [[TMP36:%.*]] = sitofp <8 x i32> [[WIDE_LOAD6]] to <8 x float> +; AVX2-NEXT: [[TMP37:%.*]] = sitofp <8 x i32> [[WIDE_LOAD7]] to <8 x float> +; AVX2-NEXT: [[TMP38:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP34]] +; AVX2-NEXT: [[TMP39:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD8]], [[TMP35]] +; AVX2-NEXT: [[TMP40:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD9]], [[TMP36]] +; AVX2-NEXT: [[TMP41:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD10]], [[TMP37]] ; AVX2-NEXT: [[TMP42:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]] -; AVX2-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[TMP40]], i32 0 -; AVX2-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP36]], <8 x float>* [[TMP45]], i32 4, <8 x i1> [[TMP16]]) -; AVX2-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP40]], i32 8 +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[A]], i64 [[TMP4]] +; AVX2-NEXT: [[TMP45:%.*]] = getelementptr float, float* [[A]], i64 [[TMP5]] +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP42]], i32 0 ; AVX2-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP37]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP17]]) -; AVX2-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP40]], i32 16 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP47]], i32 4, <8 x i1> [[TMP18]]) +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP42]], i32 8 ; AVX2-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP38]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP18]]) -; AVX2-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP40]], i32 24 +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP49]], i32 4, <8 x i1> [[TMP19]]) +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP42]], i32 16 ; AVX2-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <8 x float>* -; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP39]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP19]]) +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP40]], <8 x float>* [[TMP51]], i32 4, <8 x i1> [[TMP20]]) +; AVX2-NEXT: [[TMP52:%.*]] = getelementptr float, float* [[TMP42]], i32 24 +; AVX2-NEXT: [[TMP53:%.*]] = bitcast float* [[TMP52]] to <8 x float>* +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP41]], <8 x float>* [[TMP53]], i32 4, <8 x i1> [[TMP21]]) ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; AVX2-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 -; AVX2-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] +; AVX2-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX2-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; AVX2: middle.block: ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -853,21 +853,21 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX2: for.body: ; AVX2-NEXT: 
[[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] ; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP53]], 100 +; AVX2-NEXT: [[TMP55:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP55]], 100 ; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; AVX2: if.then: ; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] -; AVX2-NEXT: [[TMP54:%.*]] = load float, float* [[ARRAYIDX3]], align 4 -; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP53]] to float -; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP54]], [[CONV]] +; AVX2-NEXT: [[TMP56:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP55]] to float +; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP56]], [[CONV]] ; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] ; AVX2-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4 ; AVX2-NEXT: br label [[FOR_INC]] ; AVX2: for.inc: ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000 -; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; AVX2: for.end: ; AVX2-NEXT: ret void ; @@ -890,73 +890,73 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt ; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] ; AVX512: vector.body: ; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; AVX512-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; AVX512-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16 -; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 32 -; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 48 -; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP1]] +; AVX512-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 +; AVX512-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 16 +; AVX512-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 32 +; AVX512-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 48 ; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 -; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4 -; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16 +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0 ; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4 -; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 32 +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], 
align 4 +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16 ; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4 -; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 48 +; AVX512-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4 +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 32 ; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* -; AVX512-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4 -; AVX512-NEXT: [[TMP16:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], -; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD12]], -; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD13]], -; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD14]], -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[B]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr float, float* [[B]], i64 [[TMP1]] +; AVX512-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i32>, <16 x i32>* [[TMP15]], align 4 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 48 +; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, <16 x i32>* [[TMP17]], align 4 +; AVX512-NEXT: [[TMP18:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP19:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD5]], +; AVX512-NEXT: [[TMP20:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD6]], +; AVX512-NEXT: [[TMP21:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD7]], ; AVX512-NEXT: [[TMP22:%.*]] = getelementptr float, float* [[B]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP23:%.*]] = getelementptr float, float* [[B]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[TMP20]], i32 0 -; AVX512-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP25]], i32 4, <16 x i1> [[TMP16]], <16 x float> poison) -; AVX512-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP20]], i32 16 +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr float, float* [[B]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr float, float* [[B]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr float, float* [[TMP22]], i32 0 ; AVX512-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP17]], <16 x float> poison) -; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP20]], i32 32 +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP27]], i32 4, <16 x i1> [[TMP18]], <16 x float> poison) +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr float, float* [[TMP22]], i32 16 ; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD16:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP18]], <16 x float> poison) -; AVX512-NEXT: [[TMP30:%.*]] = getelementptr float, float* [[TMP20]], i32 48 +; AVX512-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP19]], <16 x float> poison) +; AVX512-NEXT: 
[[TMP30:%.*]] = getelementptr float, float* [[TMP22]], i32 32 ; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>* -; AVX512-NEXT: [[WIDE_MASKED_LOAD17:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP19]], <16 x float> poison) -; AVX512-NEXT: [[TMP32:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> -; AVX512-NEXT: [[TMP33:%.*]] = sitofp <16 x i32> [[WIDE_LOAD12]] to <16 x float> -; AVX512-NEXT: [[TMP34:%.*]] = sitofp <16 x i32> [[WIDE_LOAD13]] to <16 x float> -; AVX512-NEXT: [[TMP35:%.*]] = sitofp <16 x i32> [[WIDE_LOAD14]] to <16 x float> -; AVX512-NEXT: [[TMP36:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP32]] -; AVX512-NEXT: [[TMP37:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD15]], [[TMP33]] -; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD16]], [[TMP34]] -; AVX512-NEXT: [[TMP39:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD17]], [[TMP35]] -; AVX512-NEXT: [[TMP40:%.*]] = getelementptr float, float* [[A]], i64 [[TMP0]] -; AVX512-NEXT: [[TMP41:%.*]] = getelementptr float, float* [[A]], i64 [[TMP1]] +; AVX512-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP20]], <16 x float> poison) +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP22]], i32 48 +; AVX512-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <16 x float>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP33]], i32 4, <16 x i1> [[TMP21]], <16 x float> poison) +; AVX512-NEXT: [[TMP34:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float> +; AVX512-NEXT: [[TMP35:%.*]] = sitofp <16 x i32> [[WIDE_LOAD5]] to <16 x float> +; AVX512-NEXT: [[TMP36:%.*]] = sitofp <16 x i32> [[WIDE_LOAD6]] to <16 x float> +; AVX512-NEXT: [[TMP37:%.*]] = sitofp <16 x i32> [[WIDE_LOAD7]] to <16 x float> +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP34]] +; AVX512-NEXT: [[TMP39:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD8]], [[TMP35]] +; AVX512-NEXT: [[TMP40:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD9]], [[TMP36]] +; AVX512-NEXT: [[TMP41:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD10]], [[TMP37]] ; AVX512-NEXT: [[TMP42:%.*]] = getelementptr float, float* [[A]], i64 [[TMP2]] ; AVX512-NEXT: [[TMP43:%.*]] = getelementptr float, float* [[A]], i64 [[TMP3]] -; AVX512-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[TMP40]], i32 0 -; AVX512-NEXT: [[TMP45:%.*]] = bitcast float* [[TMP44]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP36]], <16 x float>* [[TMP45]], i32 4, <16 x i1> [[TMP16]]) -; AVX512-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP40]], i32 16 +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr float, float* [[A]], i64 [[TMP4]] +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr float, float* [[A]], i64 [[TMP5]] +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr float, float* [[TMP42]], i32 0 ; AVX512-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <16 x float>* -; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP17]]) -; AVX512-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP40]], i32 32 +; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* [[TMP47]], i32 4, <16 x i1> [[TMP18]]) +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr float, float* [[TMP42]], i32 16 ; 
AVX512-NEXT: [[TMP49:%.*]] = bitcast float* [[TMP48]] to <16 x float>*
-; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP38]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP18]])
-; AVX512-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP40]], i32 48
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP49]], i32 4, <16 x i1> [[TMP19]])
+; AVX512-NEXT: [[TMP50:%.*]] = getelementptr float, float* [[TMP42]], i32 32
 ; AVX512-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <16 x float>*
-; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP39]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP19]])
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP40]], <16 x float>* [[TMP51]], i32 4, <16 x i1> [[TMP20]])
+; AVX512-NEXT: [[TMP52:%.*]] = getelementptr float, float* [[TMP42]], i32 48
+; AVX512-NEXT: [[TMP53:%.*]] = bitcast float* [[TMP52]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP41]], <16 x float>* [[TMP53]], i32 4, <16 x i1> [[TMP21]])
 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64
-; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; AVX512-NEXT: [[TMP54:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX512-NEXT: br i1 [[TMP54]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984
 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
@@ -966,50 +966,50 @@ define void @foo2(float* nocapture %A, float* nocapture readonly %B, i32* nocapt
 ; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
 ; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; AVX512: vec.epilog.vector.body:
-; AVX512-NEXT: [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512-NEXT: [[TMP53:%.*]] = add i64 [[INDEX19]], 0
-; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP53]]
-; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TMP54]], i32 0
-; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <8 x i32>*
-; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32>* [[TMP56]], align 4
-; AVX512-NEXT: [[TMP57:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD20]],
-; AVX512-NEXT: [[TMP58:%.*]] = getelementptr float, float* [[B]], i64 [[TMP53]]
-; AVX512-NEXT: [[TMP59:%.*]] = getelementptr float, float* [[TMP58]], i32 0
-; AVX512-NEXT: [[TMP60:%.*]] = bitcast float* [[TMP59]] to <8 x float>*
-; AVX512-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP60]], i32 4, <8 x i1> [[TMP57]], <8 x float> poison)
-; AVX512-NEXT: [[TMP61:%.*]] = sitofp <8 x i32> [[WIDE_LOAD20]] to <8 x float>
-; AVX512-NEXT: [[TMP62:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD21]], [[TMP61]]
-; AVX512-NEXT: [[TMP63:%.*]] = getelementptr float, float* [[A]], i64 [[TMP53]]
-; AVX512-NEXT: [[TMP64:%.*]] = getelementptr float, float* [[TMP63]], i32 0
-; AVX512-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <8 x float>*
-; AVX512-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP62]], <8 x float>* [[TMP65]], i32 4, <8 x i1> [[TMP57]])
-; AVX512-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX19]], 8
-; AVX512-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT22]], 10000
-; AVX512-NEXT: br i1 [[TMP66]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
+; AVX512-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; AVX512-NEXT: [[TMP55:%.*]] = add i64 [[OFFSET_IDX]], 0
+; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP55]]
+; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, i32* [[TMP56]], i32 0
+; AVX512-NEXT: [[TMP58:%.*]] = bitcast i32* [[TMP57]] to <8 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i32>, <8 x i32>* [[TMP58]], align 4
+; AVX512-NEXT: [[TMP59:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD13]],
+; AVX512-NEXT: [[TMP60:%.*]] = getelementptr float, float* [[B]], i64 [[TMP55]]
+; AVX512-NEXT: [[TMP61:%.*]] = getelementptr float, float* [[TMP60]], i32 0
+; AVX512-NEXT: [[TMP62:%.*]] = bitcast float* [[TMP61]] to <8 x float>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP62]], i32 4, <8 x i1> [[TMP59]], <8 x float> poison)
+; AVX512-NEXT: [[TMP63:%.*]] = sitofp <8 x i32> [[WIDE_LOAD13]] to <8 x float>
+; AVX512-NEXT: [[TMP64:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD14]], [[TMP63]]
+; AVX512-NEXT: [[TMP65:%.*]] = getelementptr float, float* [[A]], i64 [[TMP55]]
+; AVX512-NEXT: [[TMP66:%.*]] = getelementptr float, float* [[TMP65]], i32 0
+; AVX512-NEXT: [[TMP67:%.*]] = bitcast float* [[TMP66]] to <8 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP64]], <8 x float>* [[TMP67]], i32 4, <8 x i1> [[TMP59]])
+; AVX512-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[OFFSET_IDX]], 8
+; AVX512-NEXT: [[TMP68:%.*]] = icmp eq i64 [[INDEX_NEXT15]], 10000
+; AVX512-NEXT: br i1 [[TMP68]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; AVX512: vec.epilog.middle.block:
-; AVX512-NEXT: [[CMP_N18:%.*]] = icmp eq i64 10000, 10000
-; AVX512-NEXT: br i1 [[CMP_N18]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; AVX512-NEXT: [[CMP_N11:%.*]] = icmp eq i64 10000, 10000
+; AVX512-NEXT: br i1 [[CMP_N11]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; AVX512: vec.epilog.scalar.ph:
 ; AVX512-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
 ; AVX512-NEXT: br label [[FOR_BODY:%.*]]
 ; AVX512: for.body:
 ; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
 ; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: [[TMP67:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP67]], 100
+; AVX512-NEXT: [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP69]], 100
 ; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; AVX512: if.then:
 ; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: [[TMP68:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP67]] to float
-; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP68]], [[CONV]]
+; AVX512-NEXT: [[TMP70:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP69]] to float
+; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP70]], [[CONV]]
 ; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
 ; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
 ; AVX512-NEXT: br label [[FOR_INC]]
 ; AVX512: for.inc:
 ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; AVX512: for.end.loopexit:
 ; AVX512-NEXT: br label [[FOR_END]]
 ; AVX512: for.end:
@@ -1146,7 +1146,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP39]], <4 x double>* [[TMP51]], i32 8, <4 x i1> [[TMP19]]), !alias.scope !12, !noalias !14
 ; AVX-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
+; AVX-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; AVX: middle.block:
 ; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000
 ; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1170,7 +1170,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX: for.inc:
 ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; AVX: for.end:
 ; AVX-NEXT: ret void
 ;
@@ -1265,7 +1265,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP39]], <8 x double>* [[TMP51]], i32 8, <8 x i1> [[TMP19]]), !alias.scope !16, !noalias !18
 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; AVX512-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
-; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 9984
 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1289,7 +1289,7 @@ define void @foo3(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX512: for.inc:
 ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; AVX512: for.end:
 ; AVX512-NEXT: ret void
 ;
@@ -1397,7 +1397,7 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
 ; AVX512-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[VEC_IND]],
 ; AVX512-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 624
-; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP53:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 625, 624
 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1422,7 +1422,7 @@ define void @foo4(double* nocapture %A, double* nocapture readonly %B, i32* noca
 ; AVX512: for.inc:
 ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 16
 ; AVX512-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP54:![0-9]+]]
+; AVX512-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP30:![0-9]+]]
 ; AVX512: for.end:
 ; AVX512-NEXT: ret void
 ;
@@ -1676,7 +1676,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
 ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[REVERSE35]], <4 x double>* [[TMP59]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !22, !noalias !24
 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX2-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]]
+; AVX2-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
 ; AVX2: middle.block:
 ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1699,7 +1699,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
 ; AVX2: for.inc:
 ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
 ; AVX2-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
-; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
+; AVX2-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; AVX2: for.end:
 ; AVX2-NEXT: ret void
 ;
@@ -1819,7 +1819,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
 ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[REVERSE35]], <8 x double>* [[TMP59]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !36, !noalias !38
 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; AVX512-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP63:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 4096, 4096
 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -1842,7 +1842,7 @@ define void @foo6(double* nocapture readonly %in, double* nocapture %out, i32 %s
 ; AVX512: for.inc:
 ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
 ; AVX512-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV]], 0
-; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP64:![0-9]+]]
+; AVX512-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
 ; AVX512: for.end:
 ; AVX512-NEXT: ret void
 ;
@@ -1974,7 +1974,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
+; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; AVX1: middle.block:
 ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2000,7 +2000,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX1: for.inc:
 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; AVX1: for.end.loopexit:
 ; AVX1-NEXT: br label [[FOR_END]]
 ; AVX1: for.end:
@@ -2098,7 +2098,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP51:![0-9]+]]
+; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
 ; AVX2: middle.block:
 ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2124,7 +2124,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX2: for.inc:
 ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
+; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
 ; AVX2: for.end.loopexit:
 ; AVX2-NEXT: br label [[FOR_END]]
 ; AVX2: for.end:
@@ -2222,7 +2222,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]])
 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP65:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2248,7 +2248,7 @@ define void @foo7(double* noalias nocapture %out, double** noalias nocapture rea
 ; AVX512: for.inc:
 ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP66:![0-9]+]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
 ; AVX512: for.end.loopexit:
 ; AVX512-NEXT: br label [[FOR_END]]
 ; AVX512: for.end:
@@ -2391,7 +2391,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX1-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; AVX1-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; AVX1: middle.block:
 ; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2417,7 +2417,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX1: for.inc:
 ; AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
+; AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; AVX1: for.end.loopexit:
 ; AVX1-NEXT: br label [[FOR_END]]
 ; AVX1: for.end:
@@ -2515,7 +2515,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP63]], i32 8, <4 x i1> [[TMP55]])
 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
 ; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
+; AVX2-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
 ; AVX2: middle.block:
 ; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2541,7 +2541,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX2: for.inc:
 ; AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP55:![0-9]+]]
+; AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
 ; AVX2: for.end.loopexit:
 ; AVX2-NEXT: br label [[FOR_END]]
 ; AVX2: for.end:
@@ -2639,7 +2639,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP63]], i32 8, <8 x i1> [[TMP55]])
 ; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
 ; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP67:![0-9]+]]
+; AVX512-NEXT: br i1 [[TMP64]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
 ; AVX512: middle.block:
 ; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -2665,7 +2665,7 @@ define void @foo8(double* noalias nocapture %out, i32 ()** noalias nocapture rea
 ; AVX512: for.inc:
 ; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP68:![0-9]+]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
 ; AVX512: for.end.loopexit:
 ; AVX512-NEXT: br label [[FOR_END]]
 ; AVX512: for.end:
diff --git a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
index 57ddff6..46e84ab 100644
--- a/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/epilog-vectorization-reductions.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 -S | FileCheck %s
+; RUN: opt -passes=loop-vectorize -force-vector-width=4 -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 -S %s | FileCheck %s
 ;
 ; Integer reduction with a start value of 5
@@ -44,15 +44,15 @@ define i64 @int_reduction_add(i64* %a, i64 %N) {
 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[TMP7]], [[VEC_EPILOG_PH]] ], [ [[TMP12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP8]]
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP9]], i32 0
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <4 x i64>*
 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, <4 x i64>* [[TMP11]], align 4
 ; CHECK-NEXT: [[TMP12]] = add <4 x i64> [[WIDE_LOAD8]], [[VEC_PHI7]]
-; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
+; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
 ; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
@@ -141,16 +141,16 @@ define float @fp_reduction_max(float* noalias %a, i64 %N) {
 ; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x float> [[MINMAX_IDENT_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX6]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP8]]
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP11]], align 4
 ; CHECK-NEXT: [[TMP12:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI7]], [[WIDE_LOAD8]]
 ; CHECK-NEXT: [[TMP13]] = select <4 x i1> [[TMP12]], <4 x float> [[VEC_PHI7]], <4 x float> [[WIDE_LOAD8]]
-; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
+; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
 ; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
@@ -222,7 +222,7 @@ define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
 ; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i16>
 ; CHECK-NEXT: [[TMP9]] = zext <4 x i16> [[TMP8]] to <4 x i32>
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i32> [[TMP9]] to <4 x i16>
 ; CHECK-NEXT: [[TMP11:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP10]])
@@ -237,9 +237,9 @@ define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX3:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP13]], [[VEC_EPILOG_PH]] ], [ [[TMP23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[INDEX3]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP15:%.*]] = and <4 x i32> [[VEC_PHI4]],
 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i32 [[TMP14]]
 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP16]], i32 0
@@ -247,11 +247,11 @@ define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP18]], align 2
 ; CHECK-NEXT: [[TMP19:%.*]] = zext <4 x i16> [[WIDE_LOAD5]] to <4 x i32>
 ; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP15]], [[TMP19]]
-; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX3]], 4
+; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[OFFSET_IDX]], 4
 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT6]], 256
 ; CHECK-NEXT: [[TMP22:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i16>
 ; CHECK-NEXT: [[TMP23]] = zext <4 x i16> [[TMP22]] to <4 x i32>
-; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
 ; CHECK-NEXT: [[TMP24:%.*]] = trunc <4 x i32> [[TMP23]] to <4 x i16>
 ; CHECK-NEXT: [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]])
@@ -272,7 +272,7 @@ define i16 @reduction_or_trunc(i16* noalias nocapture %ptr) {
 ; CHECK-NEXT: [[XOR]] = or i32 [[SUM_02]], [[EXT]]
 ; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK: for.end.loopexit:
 ; CHECK-NEXT: [[XOR_LCSSA1:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_END]]
@@ -329,7 +329,7 @@ define float @multiple_fp_rdx(float* %A, i64 %N) {
 ; CHECK-NEXT: [[TMP5]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
 ; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
@@ -349,19 +349,19 @@ define float @multiple_fp_rdx(float* %A, i64 %N) {
 ; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> zeroinitializer, float [[BC_MERGE_RDX3]], i32 0
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x float> [ [[TMP9]], [[VEC_EPILOG_PH]] ], [ [[TMP16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI11:%.*]] = phi <4 x float> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[TMP15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX9]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
 ; CHECK-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <4 x float>*
 ; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP14]], align 4
 ; CHECK-NEXT: [[TMP15]] = fadd fast <4 x float> [[VEC_PHI11]], [[WIDE_LOAD12]]
 ; CHECK-NEXT: [[TMP16]] = fmul fast <4 x float> [[VEC_PHI10]], [[WIDE_LOAD12]]
-; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX9]], 4
+; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC7]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
 ; CHECK-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]])
 ; CHECK-NEXT: [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP16]])
@@ -382,7 +382,7 @@ define float @multiple_fp_rdx(float* %A, i64 %N) {
 ; CHECK-NEXT: [[MUL]] = fmul fast float [[PROD]], [[TMP20]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK: for.end.loopexit:
 ; CHECK-NEXT: [[ADD_LCSSA5:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: [[MUL_LCSSA4:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
@@ -444,7 +444,7 @@ define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
 ; CHECK-NEXT: [[TMP5]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
@@ -461,17 +461,17 @@ define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX6]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
 ; CHECK-NEXT: [[TMP13]] = sub <4 x i32> [[VEC_PHI7]], [[WIDE_LOAD8]]
-; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
+; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK: vec.epilog.middle.block:
 ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
 ; CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[N]], [[N_VEC4]]
@@ -488,7 +488,7 @@ define i32 @reduction_phi_start_val(i32* %A, i64 %N) {
 ; CHECK-NEXT: [[SUB]] = sub nsw i32 [[SUM]], [[LOAD]]
 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK: for.cond.loopexit:
 ; CHECK-NEXT: [[SUB_LCSSA2:%.*]] = phi i32 [ [[SUB]], [[FOR_BODY]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_COND]]
diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
index 1756a44..5d17783 100644
--- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
+++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-liveout.ll
@@ -40,7 +40,7 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) {
 ; VF-TWO-CHECK-NEXT: [[TMP7:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
 ; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VF-TWO-CHECK: middle.block:
 ; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
 ; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
@@ -55,24 +55,24 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) {
 ; VF-TWO-CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF4]]
 ; VF-TWO-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
 ; VF-TWO-CHECK: vec.epilog.vector.body:
-; VF-TWO-CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX6]], 0
+; VF-TWO-CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; VF-TWO-CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
 ; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
 ; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4
 ; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP10]]
 ; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
 ; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = add nsw <2 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD10]]
-; VF-TWO-CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX6]], 2
-; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC5]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]]
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = add nsw <2 x i32> [[WIDE_LOAD8]], [[WIDE_LOAD9]]
+; VF-TWO-CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 2
+; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
 ; VF-TWO-CHECK: vec.epilog.middle.block:
-; VF-TWO-CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC5]]
+; VF-TWO-CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC5]]
 ; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1
-; VF-TWO-CHECK-NEXT: br i1 [[CMP_N8]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-TWO-CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
 ; VF-TWO-CHECK: vec.epilog.scalar.ph:
 ; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
 ; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]]
@@ -85,7 +85,7 @@ define signext i32 @f1(i32* noalias %A, i32* noalias %B, i32 signext %n) {
 ; VF-TWO-CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
 ; VF-TWO-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; VF-TWO-CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], [[LOOP4:!llvm.loop !.*]]
+; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VF-TWO-CHECK: for.end.loopexit.loopexit:
 ; VF-TWO-CHECK-NEXT: [[ADD_LCSSA3:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
 ; VF-TWO-CHECK-NEXT: br label [[FOR_END_LOOPEXIT]]
-- 
2.7.4