-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce
; Check that we don't fall into an infinite loop.
define void @test() nounwind {
entry:
- br label %for.body
+ br label %for.body
for.body:
- %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
- br label %for.body
+ %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
+ br label %for.body
}
define void @test2() nounwind {
entry:
- br label %for.body
+ br label %for.body
for.body: ; preds = %for.body, %entry
- %indvars.iv47 = phi i64 [ 0, %entry ], [ %indvars.iv.next48, %for.body ]
- %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
- %indvars.iv.next48 = add i64 %indvars.iv47, 1
- br i1 undef, label %for.end, label %for.body
+ %indvars.iv47 = phi i64 [ 0, %entry ], [ %indvars.iv.next48, %for.body ]
+ %0 = phi i32 [ 1, %entry ], [ 0, %for.body ]
+ %indvars.iv.next48 = add i64 %indvars.iv47, 1
+ br i1 undef, label %for.end, label %for.body
for.end: ; preds = %for.body
- unreachable
+ unreachable
}
;PR14701
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -dce -force-vector-interleave=1 -force-vector-width=4
+; RUN: opt < %s -loop-vectorize -dce -force-vector-interleave=1 -force-vector-width=4
; Check that we don't crash.
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -Oz -S -enable-new-pm=0 < %s | FileCheck %s
; RUN: opt -passes='default<Oz>' -S < %s | FileCheck %s
define void @foo(float* noalias nocapture %ptrA, float* noalias nocapture readonly %ptrB, i64 %size) {
; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[EXITCOND1:%.*]] = icmp eq i64 [[SIZE:%.*]], 0
-; CHECK-NEXT: br i1 [[EXITCOND1]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SIZE]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER6:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SIZE]], -8
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[PTRB:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 4
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[PTRA:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP4]], i64 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-NEXT: [[TMP9:%.*]] = fmul <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD5]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[SIZE]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER6]]
-; CHECK: for.body.preheader6:
-; CHECK-NEXT: [[INDVARS_IV2_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV2_PH]], [[FOR_BODY_PREHEADER6]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[PTRB]], i64 [[INDVARS_IV2]]
-; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[PTRA]], i64 [[INDVARS_IV2]]
-; CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[MUL3:%.*]] = fmul float [[TMP13]], [[TMP14]]
-; CHECK-NEXT: store float [[MUL3]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV2]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[SIZE]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
+; CHECK: fmul <4 x float>
;
entry:
br label %for.cond
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mtriple=aarch64-none-linux-gnu -mattr=+neon -S | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; Function Attrs: nounwind
define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) {
-; CHECK-LABEL: @array_add(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[SIZE:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 4
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 4
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4
-; CHECK-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP18:%.*]] = add nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* [[TMP22]], align 4
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 4
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* [[TMP24]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]]
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret i32* [[C]]
-;
+;CHECK-LABEL: array_add
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
entry:
%cmp10 = icmp sgt i32 %size, 0
br i1 %cmp10, label %for.body.preheader, label %for.end
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mtriple=arm64-none-linux-gnu -mattr=+neon -S | FileCheck %s
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
; Function Attrs: nounwind
define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) {
-; CHECK-LABEL: @array_add(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[SIZE:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[SIZE]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 4
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 4
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4
-; CHECK-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP18:%.*]] = add nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* [[TMP22]], align 4
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 4
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* [[TMP24]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]]
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret i32* [[C]]
-;
+;CHECK-LABEL: array_add
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: add nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret
entry:
%cmp10 = icmp sgt i32 %size, 0
br i1 %cmp10, label %for.body.preheader, label %for.end
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-target-instruction-cost=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s 2>&1 | FileCheck %s
; This test currently fails when the LV calculates a maximum safe
define void @f1(i32* %A) #0 {
-; CHECK-LABEL: @f1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 1024, [[TMP4]]
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 1024)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -mtriple=arm64-apple-darwin -S %s | FileCheck %s
; Test cases for extending the vectorization factor: even when small memory
; operations such as a 4 x i8 load are involved, vectorization might still be
; profitable.
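;
; As a rough illustration (an assumption for readability, not taken from the
; original source), test_load_i8_store_i32 below corresponds to a C loop of
; roughly this shape:
;
;   void test_load_i8_store_i32(const unsigned char *src, int *dst, int off,
;                               long long N) {
;     for (long long i = 0; i < N; i++)
;       dst[i] = (int)src[i] + off; // i8 load widened to i32 before the store
;   }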
define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
; CHECK-LABEL: @test_load_i8_store_i32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[OFF:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[OFF]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP8]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP9]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP15]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i32 4
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load i8, i8* [[GEP_SRC]], align 1
-; CHECK-NEXT: [[LV_EXT:%.*]] = zext i8 [[LV]] to i32
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LV_EXT]], [[OFF]]
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP_DST]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK: <4 x i8>
;
entry:
br label %loop
; Same as test_load_i8_store_i32, but with types flipped for load and store.
define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
; CHECK-LABEL: @test_load_i32_store_i8(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[OFF:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[OFF]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i32> [[TMP8]] to <4 x i8>
-; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i32> [[TMP9]] to <4 x i8>
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[TMP10]], <4 x i8>* [[TMP15]], align 1
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 4
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[TMP11]], <4 x i8>* [[TMP17]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP_SRC]], align 1
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LV]], [[OFF]]
-; CHECK-NEXT: [[ADD_TRUNC:%.*]] = trunc i32 [[ADD]] to i8
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[IV]]
-; CHECK-NEXT: store i8 [[ADD_TRUNC]], i8* [[GEP_DST]], align 1
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK: <4 x i8>
;
entry:
br label %loop
; All memory operations use i32, so they are all profitable with VF 4.
define void @test_load_i32_store_i32(i32* noalias %src, i32* noalias %dst, i8 %off, i64 %N) {
; CHECK-LABEL: @test_load_i32_store_i32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[OFF:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i8> poison, i8 [[OFF]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT2]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[WIDE_LOAD]] to <4 x i8>
-; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i32> [[WIDE_LOAD1]] to <4 x i8>
-; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i8> [[TMP8]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i8> [[TMP9]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[TMP10]] to <4 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i8> [[TMP11]] to <4 x i32>
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 4
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP19]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP_SRC]], align 1
-; CHECK-NEXT: [[LV_TRUNC:%.*]] = trunc i32 [[LV]] to i8
-; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LV_TRUNC]], [[OFF]]
-; CHECK-NEXT: [[ADD_EXT:%.*]] = zext i8 [[ADD]] to i32
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[ADD_EXT]], i32* [[GEP_DST]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK: vector.body:
+; CHECK: <4 x i32>
;
entry:
br label %loop
; Test where the loop body requires many registers when the vectorization
; factor is large. Make sure the register estimates limit the vectorization
; factor.
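;
; A rough C-level sketch of the loop below (reconstructed from the scalar
; loop as an illustration; names and types are assumptions):
;
;   void test_load_i8_store_i64_large(const unsigned char *src, long long *dst,
;                                     long long *dst2, long long *dst3,
;                                     long long *dst4, long long *dst5,
;                                     long long off, long long off2,
;                                     long long N) {
;     for (long long i = 0; i < N; i++) {
;       long long t3 = dst3[i], t5 = dst3[i]; // the IR reloads dst.3 here
;       long long a  = (long long)src[i] + off;
;       long long a2 = a + off2;
;       long long a3 = a2 + t3, a4 = a3 + a, a5 = a2 + t5;
;       dst2[i] = a2; dst[i] = a; dst3[i] = a3; dst4[i] = a4; dst5[i] = a5;
;     }
;   }
;
; Many values are live at once, which is the kind of pressure the register
; estimate is meant to catch.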
define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) {
-; CHECK-LABEL: @test_load_i8_store_i64_large(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[OFF:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[OFF_2:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[DST_3:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[TMP3]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP4]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[DST_5:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP3]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP6]], align 1
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP8]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[WIDE_LOAD2]] to <2 x i64>
-; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP9]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP10]], [[BROADCAST_SPLAT4]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[DST_2:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[TMP11]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i64> [[TMP14]], [[TMP10]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[DST_4:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP17:%.*]] = add <2 x i64> [[TMP11]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, i64* [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64* [[TMP18]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* [[TMP19]], align 4
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i64* [[TMP20]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* [[TMP21]], align 4
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i64* [[TMP3]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[TMP14]], <2 x i64>* [[TMP22]], align 4
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[TMP16]], i32 0
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[TMP15]], <2 x i64>* [[TMP24]], align 4
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, i64* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast i64* [[TMP25]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[TMP17]], <2 x i64>* [[TMP26]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[IV]]
-; CHECK-NEXT: [[GEP_DST_3:%.*]] = getelementptr inbounds i64, i64* [[DST_3]], i64 [[IV]]
-; CHECK-NEXT: [[LV_DST_3:%.*]] = load i64, i64* [[GEP_DST_3]], align 1
-; CHECK-NEXT: [[GEP_DST_5:%.*]] = getelementptr inbounds i64, i64* [[DST_5]], i64 [[IV]]
-; CHECK-NEXT: [[LV_DST_5:%.*]] = load i64, i64* [[GEP_DST_3]], align 1
-; CHECK-NEXT: [[LV:%.*]] = load i8, i8* [[GEP_SRC]], align 1
-; CHECK-NEXT: [[LV_EXT:%.*]] = zext i8 [[LV]] to i64
-; CHECK-NEXT: [[ADD:%.*]] = add i64 [[LV_EXT]], [[OFF]]
-; CHECK-NEXT: [[ADD_2:%.*]] = add i64 [[ADD]], [[OFF_2]]
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i64, i64* [[DST]], i64 [[IV]]
-; CHECK-NEXT: [[GEP_DST_2:%.*]] = getelementptr inbounds i64, i64* [[DST_2]], i64 [[IV]]
-; CHECK-NEXT: [[ADD_3:%.*]] = add i64 [[ADD_2]], [[LV_DST_3]]
-; CHECK-NEXT: [[ADD_4:%.*]] = add i64 [[ADD_3]], [[ADD]]
-; CHECK-NEXT: [[GEP_DST_4:%.*]] = getelementptr inbounds i64, i64* [[DST_4]], i64 [[IV]]
-; CHECK-NEXT: [[ADD_5:%.*]] = add i64 [[ADD_2]], [[LV_DST_5]]
-; CHECK-NEXT: store i64 [[ADD_2]], i64* [[GEP_DST_2]], align 4
-; CHECK-NEXT: store i64 [[ADD]], i64* [[GEP_DST]], align 4
-; CHECK-NEXT: store i64 [[ADD_3]], i64* [[GEP_DST_3]], align 4
-; CHECK-NEXT: store i64 [[ADD_4]], i64* [[GEP_DST_4]], align 4
-; CHECK-NEXT: store i64 [[ADD_5]], i64* [[GEP_DST_5]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_load_i8_store_i64_large
+; CHECK: <2 x i64>
;
entry:
br label %loop
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=CHECK-VF4UF1
; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -mtriple aarch64-unknown-linux-gnu -mattr=+sve -S < %s | FileCheck %s --check-prefix=CHECK-VF4UF2
; }
;
define i32 @PR33613(double* %b, double %j, i32 %d) #0 {
-; CHECK-VF4UF1-LABEL: @PR33613(
-; CHECK-VF4UF1-NEXT: entry:
-; CHECK-VF4UF1-NEXT: [[IDXPROM:%.*]] = sext i32 [[D:%.*]] to i64
-; CHECK-VF4UF1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-VF4UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 10240, [[TMP1]]
-; CHECK-VF4UF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4UF1: vector.ph:
-; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-VF4UF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 10240, [[TMP3]]
-; CHECK-VF4UF1-NEXT: [[N_VEC:%.*]] = sub i64 10240, [[N_MOD_VF]]
-; CHECK-VF4UF1-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 25
-; CHECK-VF4UF1-NEXT: [[IND_END:%.*]] = getelementptr double, double* [[B:%.*]], i64 [[TMP4]]
-; CHECK-VF4UF1-NEXT: [[IND_END2:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-VF4UF1-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 4
-; CHECK-VF4UF1-NEXT: [[TMP7:%.*]] = sub i32 [[TMP6]], 1
-; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x double> poison, double [[J:%.*]], i32 [[TMP7]]
-; CHECK-VF4UF1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4UF1: vector.body:
-; CHECK-VF4UF1-NEXT: [[POINTER_PHI:%.*]] = phi double* [ [[B]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 0, i32 0), [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x double> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_MASKED_GATHER:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-VF4UF1-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 1
-; CHECK-VF4UF1-NEXT: [[TMP11:%.*]] = mul i64 25, [[TMP10]]
-; CHECK-VF4UF1-NEXT: [[TMP12:%.*]] = mul i64 [[TMP9]], 0
-; CHECK-VF4UF1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i32 0
-; CHECK-VF4UF1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF1-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-VF4UF1-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT]], [[TMP13]]
-; CHECK-VF4UF1-NEXT: [[VECTOR_GEP:%.*]] = mul <vscale x 4 x i64> [[TMP14]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 25, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4UF1-NEXT: [[TMP15:%.*]] = getelementptr double, double* [[POINTER_PHI]], <vscale x 4 x i64> [[VECTOR_GEP]]
-; CHECK-VF4UF1-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, <vscale x 4 x double*> [[TMP15]], i64 [[IDXPROM]]
-; CHECK-VF4UF1-NEXT: [[WIDE_MASKED_GATHER]] = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> [[TMP16]], i32 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x double> undef)
-; CHECK-VF4UF1-NEXT: [[TMP17:%.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> [[VECTOR_RECUR]], <vscale x 4 x double> [[WIDE_MASKED_GATHER]], i32 -1)
-; CHECK-VF4UF1-NEXT: [[TMP18:%.*]] = fmul <vscale x 4 x double> [[TMP17]], [[WIDE_MASKED_GATHER]]
-; CHECK-VF4UF1-NEXT: [[TMP19:%.*]] = fcmp une <vscale x 4 x double> [[TMP18]], zeroinitializer
-; CHECK-VF4UF1-NEXT: [[TMP20:%.*]] = zext <vscale x 4 x i1> [[TMP19]] to <vscale x 4 x i32>
-; CHECK-VF4UF1-NEXT: [[TMP21]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP20]]
-; CHECK-VF4UF1-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 4
-; CHECK-VF4UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-VF4UF1-NEXT: [[PTR_IND]] = getelementptr double, double* [[POINTER_PHI]], i64 [[TMP11]]
-; CHECK-VF4UF1-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4UF1-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-VF4UF1: middle.block:
-; CHECK-VF4UF1-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP21]])
-; CHECK-VF4UF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 10240, [[N_VEC]]
-; CHECK-VF4UF1-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; CHECK-VF4UF1-NEXT: [[TMP28:%.*]] = sub i32 [[TMP27]], 1
-; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x double> [[WIDE_MASKED_GATHER]], i32 [[TMP28]]
-; CHECK-VF4UF1-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 4
-; CHECK-VF4UF1-NEXT: [[TMP31:%.*]] = sub i32 [[TMP30]], 2
-; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x double> [[WIDE_MASKED_GATHER]], i32 [[TMP31]]
-; CHECK-VF4UF1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4UF1: scalar.ph:
-; CHECK-VF4UF1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[J]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi double* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ]
-; CHECK-VF4UF1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-VF4UF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF1-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-VF4UF1: for.cond.cleanup:
-; CHECK-VF4UF1-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF1-NEXT: ret i32 [[A_1_LCSSA]]
-; CHECK-VF4UF1: for.body:
-; CHECK-VF4UF1-NEXT: [[B_ADDR_012:%.*]] = phi double* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF1-NEXT: [[I_011:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC1:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF1-NEXT: [[A_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_1]], [[FOR_BODY]] ]
-; CHECK-VF4UF1-NEXT: [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP32:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B_ADDR_012]], i64 [[IDXPROM]]
-; CHECK-VF4UF1-NEXT: [[TMP32]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-VF4UF1-NEXT: [[MUL:%.*]] = fmul double [[SCALAR_RECUR]], [[TMP32]]
-; CHECK-VF4UF1-NEXT: [[TOBOOL:%.*]] = fcmp une double [[MUL]], 0.000000e+00
-; CHECK-VF4UF1-NEXT: [[INC:%.*]] = zext i1 [[TOBOOL]] to i32
-; CHECK-VF4UF1-NEXT: [[A_1]] = add nsw i32 [[A_010]], [[INC]]
-; CHECK-VF4UF1-NEXT: [[INC1]] = add nuw nsw i32 [[I_011]], 1
-; CHECK-VF4UF1-NEXT: [[ADD_PTR]] = getelementptr inbounds double, double* [[B_ADDR_012]], i64 25
-; CHECK-VF4UF1-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC1]], 10240
-; CHECK-VF4UF1-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
-; CHECK-VF4UF2-LABEL: @PR33613(
-; CHECK-VF4UF2-NEXT: entry:
-; CHECK-VF4UF2-NEXT: [[IDXPROM:%.*]] = sext i32 [[D:%.*]] to i64
-; CHECK-VF4UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-VF4UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 10240, [[TMP1]]
-; CHECK-VF4UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4UF2: vector.ph:
-; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-VF4UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 10240, [[TMP3]]
-; CHECK-VF4UF2-NEXT: [[N_VEC:%.*]] = sub i64 10240, [[N_MOD_VF]]
-; CHECK-VF4UF2-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 25
-; CHECK-VF4UF2-NEXT: [[IND_END:%.*]] = getelementptr double, double* [[B:%.*]], i64 [[TMP4]]
-; CHECK-VF4UF2-NEXT: [[IND_END2:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-VF4UF2-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 4
-; CHECK-VF4UF2-NEXT: [[TMP7:%.*]] = sub i32 [[TMP6]], 1
-; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x double> poison, double [[J:%.*]], i32 [[TMP7]]
-; CHECK-VF4UF2-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4UF2: vector.body:
-; CHECK-VF4UF2-NEXT: [[POINTER_PHI:%.*]] = phi double* [ [[B]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ insertelement (<vscale x 4 x i32> zeroinitializer, i32 0, i32 0), [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x double> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_MASKED_GATHER7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-VF4UF2-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2
-; CHECK-VF4UF2-NEXT: [[TMP11:%.*]] = mul i64 25, [[TMP10]]
-; CHECK-VF4UF2-NEXT: [[TMP12:%.*]] = mul i64 [[TMP9]], 0
-; CHECK-VF4UF2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i32 0
-; CHECK-VF4UF2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF2-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-VF4UF2-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT]], [[TMP13]]
-; CHECK-VF4UF2-NEXT: [[VECTOR_GEP:%.*]] = mul <vscale x 4 x i64> [[TMP14]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 25, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4UF2-NEXT: [[TMP15:%.*]] = getelementptr double, double* [[POINTER_PHI]], <vscale x 4 x i64> [[VECTOR_GEP]]
-; CHECK-VF4UF2-NEXT: [[TMP16:%.*]] = mul i64 [[TMP9]], 1
-; CHECK-VF4UF2-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP16]], i32 0
-; CHECK-VF4UF2-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF2-NEXT: [[TMP17:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-VF4UF2-NEXT: [[TMP18:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT4]], [[TMP17]]
-; CHECK-VF4UF2-NEXT: [[VECTOR_GEP5:%.*]] = mul <vscale x 4 x i64> [[TMP18]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 25, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4UF2-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[POINTER_PHI]], <vscale x 4 x i64> [[VECTOR_GEP5]]
-; CHECK-VF4UF2-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, <vscale x 4 x double*> [[TMP15]], i64 [[IDXPROM]]
-; CHECK-VF4UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, <vscale x 4 x double*> [[TMP19]], i64 [[IDXPROM]]
-; CHECK-VF4UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> [[TMP20]], i32 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x double> undef)
-; CHECK-VF4UF2-NEXT: [[WIDE_MASKED_GATHER7]] = call <vscale x 4 x double> @llvm.masked.gather.nxv4f64.nxv4p0f64(<vscale x 4 x double*> [[TMP21]], i32 8, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x double> undef)
-; CHECK-VF4UF2-NEXT: [[TMP22:%.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> [[VECTOR_RECUR]], <vscale x 4 x double> [[WIDE_MASKED_GATHER]], i32 -1)
-; CHECK-VF4UF2-NEXT: [[TMP23:%.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> [[WIDE_MASKED_GATHER]], <vscale x 4 x double> [[WIDE_MASKED_GATHER7]], i32 -1)
-; CHECK-VF4UF2-NEXT: [[TMP24:%.*]] = fmul <vscale x 4 x double> [[TMP22]], [[WIDE_MASKED_GATHER]]
-; CHECK-VF4UF2-NEXT: [[TMP25:%.*]] = fmul <vscale x 4 x double> [[TMP23]], [[WIDE_MASKED_GATHER7]]
-; CHECK-VF4UF2-NEXT: [[TMP26:%.*]] = fcmp une <vscale x 4 x double> [[TMP24]], zeroinitializer
-; CHECK-VF4UF2-NEXT: [[TMP27:%.*]] = fcmp une <vscale x 4 x double> [[TMP25]], zeroinitializer
-; CHECK-VF4UF2-NEXT: [[TMP28:%.*]] = zext <vscale x 4 x i1> [[TMP26]] to <vscale x 4 x i32>
-; CHECK-VF4UF2-NEXT: [[TMP29:%.*]] = zext <vscale x 4 x i1> [[TMP27]] to <vscale x 4 x i32>
-; CHECK-VF4UF2-NEXT: [[TMP30]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP28]]
-; CHECK-VF4UF2-NEXT: [[TMP31]] = add <vscale x 4 x i32> [[VEC_PHI6]], [[TMP29]]
-; CHECK-VF4UF2-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 8
-; CHECK-VF4UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP33]]
-; CHECK-VF4UF2-NEXT: [[PTR_IND]] = getelementptr double, double* [[POINTER_PHI]], i64 [[TMP11]]
-; CHECK-VF4UF2-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4UF2-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-VF4UF2: middle.block:
-; CHECK-VF4UF2-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[TMP31]], [[TMP30]]
-; CHECK-VF4UF2-NEXT: [[TMP35:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
-; CHECK-VF4UF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 10240, [[N_VEC]]
-; CHECK-VF4UF2-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT: [[TMP37:%.*]] = mul i32 [[TMP36]], 4
-; CHECK-VF4UF2-NEXT: [[TMP38:%.*]] = sub i32 [[TMP37]], 1
-; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x double> [[WIDE_MASKED_GATHER7]], i32 [[TMP38]]
-; CHECK-VF4UF2-NEXT: [[TMP39:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT: [[TMP40:%.*]] = mul i32 [[TMP39]], 4
-; CHECK-VF4UF2-NEXT: [[TMP41:%.*]] = sub i32 [[TMP40]], 2
-; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x double> [[WIDE_MASKED_GATHER7]], i32 [[TMP41]]
-; CHECK-VF4UF2-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4UF2: scalar.ph:
-; CHECK-VF4UF2-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[J]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi double* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ]
-; CHECK-VF4UF2-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-VF4UF2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF2-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-VF4UF2: for.cond.cleanup:
-; CHECK-VF4UF2-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF2-NEXT: ret i32 [[A_1_LCSSA]]
-; CHECK-VF4UF2: for.body:
-; CHECK-VF4UF2-NEXT: [[B_ADDR_012:%.*]] = phi double* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[I_011:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC1:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[A_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_1]], [[FOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP42:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B_ADDR_012]], i64 [[IDXPROM]]
-; CHECK-VF4UF2-NEXT: [[TMP42]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-VF4UF2-NEXT: [[MUL:%.*]] = fmul double [[SCALAR_RECUR]], [[TMP42]]
-; CHECK-VF4UF2-NEXT: [[TOBOOL:%.*]] = fcmp une double [[MUL]], 0.000000e+00
-; CHECK-VF4UF2-NEXT: [[INC:%.*]] = zext i1 [[TOBOOL]] to i32
-; CHECK-VF4UF2-NEXT: [[A_1]] = add nsw i32 [[A_010]], [[INC]]
-; CHECK-VF4UF2-NEXT: [[INC1]] = add nuw nsw i32 [[I_011]], 1
-; CHECK-VF4UF2-NEXT: [[ADD_PTR]] = getelementptr inbounds double, double* [[B_ADDR_012]], i64 25
-; CHECK-VF4UF2-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC1]], 10240
-; CHECK-VF4UF2-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
+; CHECK-VF4UF2-LABEL: @PR33613
+; CHECK-VF4UF2: vector.body
+; CHECK-VF4UF2: %[[VEC_RECUR:.*]] = phi <vscale x 4 x double> [ {{.*}}, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK-VF4UF2: %[[SPLICE1:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %[[VEC_RECUR]], <vscale x 4 x double> {{.*}}, i32 -1)
+; CHECK-VF4UF2-NEXT: %[[SPLICE2:.*]] = call <vscale x 4 x double> @llvm.experimental.vector.splice.nxv4f64(<vscale x 4 x double> %{{.*}}, <vscale x 4 x double> %{{.*}}, i32 -1)
+; CHECK-VF4UF2-NOT: insertelement <vscale x 4 x double>
+; CHECK-VF4UF2: middle.block
entry:
%idxprom = sext i32 %d to i64
br label %for.body
;
; Check that the sext sank after the load in the vector loop.
define void @PR34711([2 x i16]* %a, i32* %b, i32* %c, i64 %n) #0 {
-; CHECK-VF4UF1-LABEL: @PR34711(
-; CHECK-VF4UF1-NEXT: entry:
-; CHECK-VF4UF1-NEXT: [[C1:%.*]] = bitcast i32* [[C:%.*]] to i8*
-; CHECK-VF4UF1-NEXT: [[B3:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-VF4UF1-NEXT: [[PRE_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A:%.*]], i64 0, i64 0
-; CHECK-VF4UF1-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[PRE_INDEX]], align 2
-; CHECK-VF4UF1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-VF4UF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4UF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK-VF4UF1: vector.memcheck:
-; CHECK-VF4UF1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[C]], i64 [[N]]
-; CHECK-VF4UF1-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-VF4UF1-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]]
-; CHECK-VF4UF1-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-VF4UF1-NEXT: [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1
-; CHECK-VF4UF1-NEXT: [[SCEVGEP67:%.*]] = bitcast i16* [[SCEVGEP6]] to i8*
-; CHECK-VF4UF1-NEXT: [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0
-; CHECK-VF4UF1-NEXT: [[SCEVGEP89:%.*]] = bitcast i16* [[SCEVGEP8]] to i8*
-; CHECK-VF4UF1-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP45]]
-; CHECK-VF4UF1-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; CHECK-VF4UF1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-VF4UF1-NEXT: [[BOUND010:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP89]]
-; CHECK-VF4UF1-NEXT: [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]]
-; CHECK-VF4UF1-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]]
-; CHECK-VF4UF1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]]
-; CHECK-VF4UF1-NEXT: [[BOUND013:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP89]]
-; CHECK-VF4UF1-NEXT: [[BOUND114:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP45]]
-; CHECK-VF4UF1-NEXT: [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]]
-; CHECK-VF4UF1-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]]
-; CHECK-VF4UF1-NEXT: br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4UF1: vector.ph:
-; CHECK-VF4UF1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-VF4UF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4UF1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-VF4UF1-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
-; CHECK-VF4UF1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP5]]
-; CHECK-VF4UF1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4
-; CHECK-VF4UF1-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP8]]
-; CHECK-VF4UF1-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-VF4UF1-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i64> [[TMP9]], zeroinitializer
-; CHECK-VF4UF1-NEXT: [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP10]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4UF1-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP11]]
-; CHECK-VF4UF1-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-VF4UF1-NEXT: [[TMP14:%.*]] = mul i64 1, [[TMP13]]
-; CHECK-VF4UF1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP14]], i32 0
-; CHECK-VF4UF1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4UF1: vector.body:
-; CHECK-VF4UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_MASKED_GATHER:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF1-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4UF1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP15]]
-; CHECK-VF4UF1-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], <vscale x 4 x i64> [[VEC_IND]], i64 1
-; CHECK-VF4UF1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
-; CHECK-VF4UF1-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <vscale x 4 x i32>*
-; CHECK-VF4UF1-NEXT: store <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32>* [[TMP19]], align 4, !alias.scope !4, !noalias !7
-; CHECK-VF4UF1-NEXT: [[WIDE_MASKED_GATHER]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> [[TMP17]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> undef), !alias.scope !10
-; CHECK-VF4UF1-NEXT: [[TMP20:%.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER]], i32 -1)
-; CHECK-VF4UF1-NEXT: [[TMP21:%.*]] = sext <vscale x 4 x i16> [[TMP20]] to <vscale x 4 x i32>
-; CHECK-VF4UF1-NEXT: [[TMP22:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
-; CHECK-VF4UF1-NEXT: [[TMP23:%.*]] = mul nsw <vscale x 4 x i32> [[TMP22]], [[TMP21]]
-; CHECK-VF4UF1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP15]]
-; CHECK-VF4UF1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i32 0
-; CHECK-VF4UF1-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <vscale x 4 x i32>*
-; CHECK-VF4UF1-NEXT: store <vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32>* [[TMP26]], align 4, !alias.scope !11, !noalias !10
-; CHECK-VF4UF1-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF1-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 4
-; CHECK-VF4UF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP28]]
-; CHECK-VF4UF1-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-VF4UF1-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4UF1-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-VF4UF1: middle.block:
-; CHECK-VF4UF1-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 4
-; CHECK-VF4UF1-NEXT: [[TMP32:%.*]] = sub i32 [[TMP31]], 1
-; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_MASKED_GATHER]], i32 [[TMP32]]
-; CHECK-VF4UF1-NEXT: [[TMP33:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF1-NEXT: [[TMP34:%.*]] = mul i32 [[TMP33]], 4
-; CHECK-VF4UF1-NEXT: [[TMP35:%.*]] = sub i32 [[TMP34]], 2
-; CHECK-VF4UF1-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_MASKED_GATHER]], i32 [[TMP35]]
-; CHECK-VF4UF1-NEXT: br label [[SCALAR_PH]]
-; CHECK-VF4UF1: scalar.ph:
-; CHECK-VF4UF1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-VF4UF1-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-VF4UF1: for.body:
-; CHECK-VF4UF1-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP36:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF1-NEXT: [[ARRAYCIDX:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-VF4UF1-NEXT: [[CUR_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[INDVARS_IV]], i64 1
-; CHECK-VF4UF1-NEXT: store i32 7, i32* [[ARRAYCIDX]], align 4
-; CHECK-VF4UF1-NEXT: [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32
-; CHECK-VF4UF1-NEXT: [[TMP36]] = load i16, i16* [[CUR_INDEX]], align 2
-; CHECK-VF4UF1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP36]] to i32
-; CHECK-VF4UF1-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV3]], [[CONV]]
-; CHECK-VF4UF1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-VF4UF1-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX5]], align 4
-; CHECK-VF4UF1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-VF4UF1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-VF4UF1-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-VF4UF1: for.end:
-; CHECK-VF4UF1-NEXT: ret void
-;
-; CHECK-VF4UF2-LABEL: @PR34711(
-; CHECK-VF4UF2-NEXT: entry:
-; CHECK-VF4UF2-NEXT: [[C1:%.*]] = bitcast i32* [[C:%.*]] to i8*
-; CHECK-VF4UF2-NEXT: [[B3:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-VF4UF2-NEXT: [[PRE_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A:%.*]], i64 0, i64 0
-; CHECK-VF4UF2-NEXT: [[DOTPRE:%.*]] = load i16, i16* [[PRE_INDEX]], align 2
-; CHECK-VF4UF2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-VF4UF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4UF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK-VF4UF2: vector.memcheck:
-; CHECK-VF4UF2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[C]], i64 [[N]]
-; CHECK-VF4UF2-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-VF4UF2-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B]], i64 [[N]]
-; CHECK-VF4UF2-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-VF4UF2-NEXT: [[SCEVGEP6:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 0, i64 1
-; CHECK-VF4UF2-NEXT: [[SCEVGEP67:%.*]] = bitcast i16* [[SCEVGEP6]] to i8*
-; CHECK-VF4UF2-NEXT: [[SCEVGEP8:%.*]] = getelementptr [2 x i16], [2 x i16]* [[A]], i64 [[N]], i64 0
-; CHECK-VF4UF2-NEXT: [[SCEVGEP89:%.*]] = bitcast i16* [[SCEVGEP8]] to i8*
-; CHECK-VF4UF2-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP45]]
-; CHECK-VF4UF2-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; CHECK-VF4UF2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-VF4UF2-NEXT: [[BOUND010:%.*]] = icmp ult i8* [[C1]], [[SCEVGEP89]]
-; CHECK-VF4UF2-NEXT: [[BOUND111:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP2]]
-; CHECK-VF4UF2-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]]
-; CHECK-VF4UF2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]]
-; CHECK-VF4UF2-NEXT: [[BOUND013:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP89]]
-; CHECK-VF4UF2-NEXT: [[BOUND114:%.*]] = icmp ult i8* [[SCEVGEP67]], [[SCEVGEP45]]
-; CHECK-VF4UF2-NEXT: [[FOUND_CONFLICT15:%.*]] = and i1 [[BOUND013]], [[BOUND114]]
-; CHECK-VF4UF2-NEXT: [[CONFLICT_RDX16:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT15]]
-; CHECK-VF4UF2-NEXT: br i1 [[CONFLICT_RDX16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4UF2: vector.ph:
-; CHECK-VF4UF2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-VF4UF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4UF2-NEXT: [[TMP4:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-VF4UF2-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 [[N_MOD_VF]]
-; CHECK-VF4UF2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[TMP5]]
-; CHECK-VF4UF2-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], 4
-; CHECK-VF4UF2-NEXT: [[TMP8:%.*]] = sub i32 [[TMP7]], 1
-; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP8]]
-; CHECK-VF4UF2-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-VF4UF2-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i64> [[TMP9]], zeroinitializer
-; CHECK-VF4UF2-NEXT: [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP10]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4UF2-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP11]]
-; CHECK-VF4UF2-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-VF4UF2-NEXT: [[TMP14:%.*]] = mul i64 1, [[TMP13]]
-; CHECK-VF4UF2-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP14]], i32 0
-; CHECK-VF4UF2-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4UF2-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4UF2: vector.body:
-; CHECK-VF4UF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_MASKED_GATHER18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-VF4UF2-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4UF2-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
-; CHECK-VF4UF2-NEXT: [[TMP18:%.*]] = add i64 [[TMP17]], 0
-; CHECK-VF4UF2-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 1
-; CHECK-VF4UF2-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], [[TMP19]]
-; CHECK-VF4UF2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP15]]
-; CHECK-VF4UF2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP20]]
-; CHECK-VF4UF2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], <vscale x 4 x i64> [[VEC_IND]], i64 1
-; CHECK-VF4UF2-NEXT: [[TMP24:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], <vscale x 4 x i64> [[STEP_ADD]], i64 1
-; CHECK-VF4UF2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0
-; CHECK-VF4UF2-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <vscale x 4 x i32>*
-; CHECK-VF4UF2-NEXT: store <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32>* [[TMP26]], align 4, !alias.scope !4, !noalias !7
-; CHECK-VF4UF2-NEXT: [[TMP27:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT: [[TMP28:%.*]] = mul i32 [[TMP27]], 4
-; CHECK-VF4UF2-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 [[TMP28]]
-; CHECK-VF4UF2-NEXT: [[TMP30:%.*]] = bitcast i32* [[TMP29]] to <vscale x 4 x i32>*
-; CHECK-VF4UF2-NEXT: store <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32>* [[TMP30]], align 4, !alias.scope !4, !noalias !7
-; CHECK-VF4UF2-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> [[TMP23]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> undef), !alias.scope !10
-; CHECK-VF4UF2-NEXT: [[WIDE_MASKED_GATHER18]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> [[TMP24]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> undef), !alias.scope !10
-; CHECK-VF4UF2-NEXT: [[TMP31:%.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER]], i32 -1)
-; CHECK-VF4UF2-NEXT: [[TMP32:%.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> [[WIDE_MASKED_GATHER]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER18]], i32 -1)
-; CHECK-VF4UF2-NEXT: [[TMP33:%.*]] = sext <vscale x 4 x i16> [[TMP31]] to <vscale x 4 x i32>
-; CHECK-VF4UF2-NEXT: [[TMP34:%.*]] = sext <vscale x 4 x i16> [[TMP32]] to <vscale x 4 x i32>
-; CHECK-VF4UF2-NEXT: [[TMP35:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
-; CHECK-VF4UF2-NEXT: [[TMP36:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER18]] to <vscale x 4 x i32>
-; CHECK-VF4UF2-NEXT: [[TMP37:%.*]] = mul nsw <vscale x 4 x i32> [[TMP35]], [[TMP33]]
-; CHECK-VF4UF2-NEXT: [[TMP38:%.*]] = mul nsw <vscale x 4 x i32> [[TMP36]], [[TMP34]]
-; CHECK-VF4UF2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP15]]
-; CHECK-VF4UF2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP20]]
-; CHECK-VF4UF2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[TMP39]], i32 0
-; CHECK-VF4UF2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <vscale x 4 x i32>*
-; CHECK-VF4UF2-NEXT: store <vscale x 4 x i32> [[TMP37]], <vscale x 4 x i32>* [[TMP42]], align 4, !alias.scope !11, !noalias !10
-; CHECK-VF4UF2-NEXT: [[TMP43:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT: [[TMP44:%.*]] = mul i32 [[TMP43]], 4
-; CHECK-VF4UF2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TMP39]], i32 [[TMP44]]
-; CHECK-VF4UF2-NEXT: [[TMP46:%.*]] = bitcast i32* [[TMP45]] to <vscale x 4 x i32>*
-; CHECK-VF4UF2-NEXT: store <vscale x 4 x i32> [[TMP38]], <vscale x 4 x i32>* [[TMP46]], align 4, !alias.scope !11, !noalias !10
-; CHECK-VF4UF2-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4UF2-NEXT: [[TMP48:%.*]] = mul i64 [[TMP47]], 8
-; CHECK-VF4UF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP48]]
-; CHECK-VF4UF2-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
-; CHECK-VF4UF2-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4UF2-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-VF4UF2: middle.block:
-; CHECK-VF4UF2-NEXT: [[TMP50:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT: [[TMP51:%.*]] = mul i32 [[TMP50]], 4
-; CHECK-VF4UF2-NEXT: [[TMP52:%.*]] = sub i32 [[TMP51]], 1
-; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_MASKED_GATHER18]], i32 [[TMP52]]
-; CHECK-VF4UF2-NEXT: [[TMP53:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4UF2-NEXT: [[TMP54:%.*]] = mul i32 [[TMP53]], 4
-; CHECK-VF4UF2-NEXT: [[TMP55:%.*]] = sub i32 [[TMP54]], 2
-; CHECK-VF4UF2-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <vscale x 4 x i16> [[WIDE_MASKED_GATHER18]], i32 [[TMP55]]
-; CHECK-VF4UF2-NEXT: br label [[SCALAR_PH]]
-; CHECK-VF4UF2: scalar.ph:
-; CHECK-VF4UF2-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ [[DOTPRE]], [[VECTOR_MEMCHECK]] ], [ [[DOTPRE]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4UF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-VF4UF2-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-VF4UF2: for.body:
-; CHECK-VF4UF2-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP56:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4UF2-NEXT: [[ARRAYCIDX:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-VF4UF2-NEXT: [[CUR_INDEX:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[INDVARS_IV]], i64 1
-; CHECK-VF4UF2-NEXT: store i32 7, i32* [[ARRAYCIDX]], align 4
-; CHECK-VF4UF2-NEXT: [[CONV:%.*]] = sext i16 [[SCALAR_RECUR]] to i32
-; CHECK-VF4UF2-NEXT: [[TMP56]] = load i16, i16* [[CUR_INDEX]], align 2
-; CHECK-VF4UF2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP56]] to i32
-; CHECK-VF4UF2-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV3]], [[CONV]]
-; CHECK-VF4UF2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-VF4UF2-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX5]], align 4
-; CHECK-VF4UF2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-VF4UF2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-VF4UF2-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-VF4UF2: for.end:
-; CHECK-VF4UF2-NEXT: ret void
-;
+; CHECK-VF4UF1-LABEL: @PR34711
+; CHECK-VF4UF1: vector.body
+; CHECK-VF4UF1: %[[VEC_RECUR:.*]] = phi <vscale x 4 x i16> [ %vector.recur.init, %vector.ph ], [ %[[MGATHER:.*]], %vector.body ]
+; CHECK-VF4UF1: %[[MGATHER]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0i16(<vscale x 4 x i16*> {{.*}}, i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> undef)
+; CHECK-VF4UF1-NEXT: %[[SPLICE:.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> %[[VEC_RECUR]], <vscale x 4 x i16> %[[MGATHER]], i32 -1)
+; CHECK-VF4UF1-NEXT: %[[SXT1:.*]] = sext <vscale x 4 x i16> %[[SPLICE]] to <vscale x 4 x i32>
+; CHECK-VF4UF1-NEXT: %[[SXT2:.*]] = sext <vscale x 4 x i16> %[[MGATHER]] to <vscale x 4 x i32>
+; CHECK-VF4UF1-NEXT: mul nsw <vscale x 4 x i32> %[[SXT2]], %[[SXT1]]
entry:
%pre.index = getelementptr inbounds [2 x i16], [2 x i16]* %a, i64 0, i64 0
%.pre = load i16, i16* %pre.index
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -mtriple=arm64-apple-ios -S -mcpu=cyclone -enable-interleaved-mem-accesses=false < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128"
; CHECK-NOT: x float>
define void @_Z4testmm(i64 %size, i64 %offset) {
-; CHECK-LABEL: @_Z4testmm(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP53:%.*]] = icmp eq i64 [[SIZE:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP53]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]]
-; CHECK: for.body.lr.ph:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[R_057:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[G_056:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD20:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[V_055:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[B_054:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD30:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ADD:%.*]] = add i64 [[V_055]], [[OFFSET:%.*]]
-; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[ADD]], 3
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[MUL]]
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i64 0, i64 [[V_055]]
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i64 0, i64 [[V_055]]
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP2]]
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i64 0, i64 [[V_055]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP3]]
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i64 0, i64 [[V_055]]
-; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX8]], align 4
-; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP4]]
-; CHECK-NEXT: [[ADD10]] = fadd fast float [[R_057]], [[MUL9]]
-; CHECK-NEXT: [[ARRAYIDX_SUM:%.*]] = add i64 [[MUL]], 1
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[ARRAYIDX_SUM]]
-; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX11]], align 4
-; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP1]], [[TMP5]]
-; CHECK-NEXT: [[MUL15:%.*]] = fmul fast float [[TMP2]], [[MUL13]]
-; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP3]], [[MUL15]]
-; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP4]], [[MUL17]]
-; CHECK-NEXT: [[ADD20]] = fadd fast float [[G_056]], [[MUL19]]
-; CHECK-NEXT: [[ARRAYIDX_SUM52:%.*]] = add i64 [[MUL]], 2
-; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i64 0, i64 [[ARRAYIDX_SUM52]]
-; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX21]], align 4
-; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[TMP1]], [[TMP6]]
-; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[TMP2]], [[MUL23]]
-; CHECK-NEXT: [[MUL27:%.*]] = fmul fast float [[TMP3]], [[MUL25]]
-; CHECK-NEXT: [[MUL29:%.*]] = fmul fast float [[TMP4]], [[MUL27]]
-; CHECK-NEXT: [[ADD30]] = fadd fast float [[B_054]], [[MUL29]]
-; CHECK-NEXT: [[INC]] = add i64 [[V_055]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INC]], [[SIZE]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]]
-; CHECK: for.cond.for.end_crit_edge:
-; CHECK-NEXT: [[ADD30_LCSSA:%.*]] = phi float [ [[ADD30]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ADD20_LCSSA:%.*]] = phi float [ [[ADD20]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ADD10_LCSSA:%.*]] = phi float [ [[ADD10]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[PHITMP:%.*]] = fptoui float [[ADD10_LCSSA]] to i8
-; CHECK-NEXT: [[PHITMP60:%.*]] = fptoui float [[ADD20_LCSSA]] to i8
-; CHECK-NEXT: [[PHITMP61:%.*]] = fptoui float [[ADD30_LCSSA]] to i8
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[G_0_LCSSA:%.*]] = phi i8 [ [[PHITMP60]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[B_0_LCSSA:%.*]] = phi i8 [ [[PHITMP61]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: store i8 [[R_0_LCSSA]], i8* @r_, align 1
-; CHECK-NEXT: store i8 [[G_0_LCSSA]], i8* @g_, align 1
-; CHECK-NEXT: store i8 [[B_0_LCSSA]], i8* @b_, align 1
-; CHECK-NEXT: ret void
-;
entry:
%cmp53 = icmp eq i64 %size, 0
br i1 %cmp53, label %for.end, label %for.body.lr.ph
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -mtriple=arm64-apple-darwin -S %s | FileCheck %s
; In the loop below, both the current and previous values of a first-order
; recurrence are stored in an interleave group.
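; Rough C equivalent (an illustrative sketch, not part of the test; variable names are
; invented):
;   int prev = 99;
;   for (long i = 0; i < 1000; i++) {
;     int cur = *src;
;     dst[3 * i]     = 0;
;     dst[3 * i + 1] = prev;               // previous value of the recurrence
;     dst[3 * i + 2] = cur;                // current value
;     prev = cur;
;   }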
define void @interleaved_store_first_order_recurrence(i32* noalias %src, i32* %dst) {
; CHECK-LABEL: @interleaved_store_first_order_recurrence(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 99>, [[VECTOR_PH]] ], [ [[BROADCAST_SPLAT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 99>, %vector.ph ], [ [[BROADCAST_SPLAT:%.*]], %vector.body ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[SRC:%.*]], align 4
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP3:%.*]] = mul nuw nsw i64 [[TMP0]], 3
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i64 2
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 -2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i32> [[TMP10]], <12 x i32> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT: store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i64 2
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 -2
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <12 x i32>*
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> [[TMP10]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i32> [[TMP11]], <12 x i32> poison, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT: store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[BROADCAST_SPLAT]], i32 2
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 99, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[FOR_NEXT]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT: [[OFF:%.*]] = mul nuw nsw i64 [[IV]], 3
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[OFF]]
-; CHECK-NEXT: store i32 0, i32* [[GEP_1]], align 4
-; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[GEP_1]], i64 1
-; CHECK-NEXT: store i32 [[SCALAR_RECUR]], i32* [[GEP_2]], align 4
-; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[GEP_1]], i64 2
-; CHECK-NEXT: store i32 [[FOR_NEXT]], i32* [[GEP_3]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP12]], label %middle.block, label %vector.body
;
entry:
br label %loop
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[IND_END17:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT: [[IND_END14:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
-; CHECK-NEXT: [[CAST_CRD10:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[IND_END11:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD10]]
+; CHECK-NEXT: [[IND_END19:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
+; CHECK-NEXT: [[IND_END16:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT: [[CAST_CRD12:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[IND_END13:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD12]]
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 24
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK-NEXT: [[N_VEC9:%.*]] = and i64 [[TMP2]], 8589934584
; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC9]] to i32
; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
-; CHECK-NEXT: [[IND_END13:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC9]]
-; CHECK-NEXT: [[IND_END16:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC9]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT23:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT24:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT23]], <8 x i8> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[IND_END15:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC9]]
+; CHECK-NEXT: [[IND_END18:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC9]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT25:%.*]] = insertelement <8 x i8> poison, i8 [[OFFSET]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT26:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT25]], <8 x i8> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT25:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP20:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX19]]
-; CHECK-NEXT: [[NEXT_GEP21:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX19]]
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[NEXT_GEP20]] to <8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i8>, <8 x i8>* [[TMP12]], align 2
-; CHECK-NEXT: [[TMP13:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD22]], <8 x i8> [[BROADCAST_SPLAT24]])
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[NEXT_GEP21]] to <8 x i8>*
-; CHECK-NEXT: store <8 x i8> [[TMP13]], <8 x i8>* [[TMP14]], align 2
-; CHECK-NEXT: [[INDEX_NEXT25]] = add nuw i64 [[INDEX19]], 8
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT25]], [[N_VEC9]]
-; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[INDEX10:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[NEXT_GEP22:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX10]]
+; CHECK-NEXT: [[NEXT_GEP23:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX10]]
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[NEXT_GEP22]] to <8 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 2
+; CHECK-NEXT: [[TMP16:%.*]] = call <8 x i8> @llvm.umin.v8i8(<8 x i8> [[WIDE_LOAD24]], <8 x i8> [[BROADCAST_SPLAT26]])
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[NEXT_GEP23]] to <8 x i8>*
+; CHECK-NEXT: store <8 x i8> [[TMP16]], <8 x i8>* [[TMP17]], align 2
+; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX10]], 8
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC9]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC9]]
-; CHECK-NEXT: br i1 [[CMP_N18]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC9]]
+; CHECK-NEXT: br i1 [[CMP_N20]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi i8* [ [[IND_END13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END14]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL15:%.*]] = phi i8* [ [[IND_END16]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END17]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END13]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i8* [ [[IND_END15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END16]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL17:%.*]] = phi i8* [ [[IND_END18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
; CHECK: while.body:
; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL12]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL15]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL14]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL17]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PSRC_ADDR_08]], i64 1
-; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
-; CHECK-NEXT: [[TMP17:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP16]], i8 [[OFFSET]])
+; CHECK-NEXT: [[TMP19:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
+; CHECK-NEXT: [[TMP20:%.*]] = tail call i8 @llvm.umin.i8(i8 [[TMP19]], i8 [[OFFSET]])
; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PDST_ADDR_07]], i64 1
-; CHECK-NEXT: store i8 [[TMP17]], i8* [[PDST_ADDR_07]], align 2
+; CHECK-NEXT: store i8 [[TMP20]], i8* [[PDST_ADDR_07]], align 2
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt -loop-vectorize -force-vector-interleave=1 -S -debug-only=loop-vectorize < %s 2>%t | FileCheck %s
; RUN: cat %t | FileCheck %s --check-prefix=CHECK-COST
; We should decide this loop is not worth vectorising using fixed-width vectors, but that it
; is worth vectorising with scalable vectors.
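; Rough C equivalent of @fixed_width and @scalable (an illustrative sketch, not part of the
; test): a conditional store that needs predication or masking to vectorise.
;   for (long i = 0; i < n; i++)
;     if (b[i] != 0)
;       a[i] = 2;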
define void @fixed_width(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) #0 {
; CHECK-LABEL: @fixed_width(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[I_07]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[I_07]]
-; CHECK-NEXT: store i32 2, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-;
+; CHECK-NOT: vector.body
entry:
%cmp6 = icmp sgt i64 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
define void @scalable(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) #0 {
; CHECK-LABEL: @scalable(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP10:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[TMP10]])
-; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_07]]
-; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP16]], 0
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_07]]
-; CHECK-NEXT: store i32 2, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
+; CHECK: vector.body
+; CHECK: call void @llvm.masked.store.nxv4i32.p0nxv4i32
entry:
%cmp6 = icmp sgt i64 %n, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphones -force-vector-width=4 -force-vector-interleave=1 %s -S | FileCheck %s
; Vectors with i4 elements may not be legal with nontemporal stores.
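; Rough shape of each loop below (an illustrative C-style sketch, not part of the tests;
; the element type, the stored constants and the number of stores per iteration vary):
;   for (unsigned i = 0; i < 4096; i += 4)
;     *ddst++ = 10;                        // store tagged with !nontemporal metadata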
define void @test_i4_store(i4* %ddst) {
-; CHECK-LABEL: @test_i4_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[DDST_ADDR:%.*]] = phi i4* [ [[DDST:%.*]], [[ENTRY]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i4, i4* [[DDST_ADDR]], i64 1
-; CHECK-NEXT: store i4 -6, i4* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @test_i4_store(
+; CHECK-NOT: vector.body:
+; CHECK: ret void
;
entry:
br label %for.body
}
define void @test_i8_store(i8* %ddst) {
-; CHECK-LABEL: @test_i8_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[DDST:%.*]], i64 1024
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[DDST]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> <i8 10, i8 10, i8 10, i8 10>, <4 x i8>* [[TMP2]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DDST]], [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[DDST_ADDR:%.*]] = phi i8* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[DDST_ADDR]], i64 1
-; CHECK-NEXT: store i8 10, i8* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @test_i8_store(
+; CHECK-LABEL: vector.body:
+; CHECK: store <4 x i8> {{.*}} !nontemporal !0
+; CHECK: br
;
entry:
br label %for.body
}
define void @test_half_store(half* %ddst) {
-; CHECK-LABEL: @test_half_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr half, half* [[DDST:%.*]], i64 1024
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr half, half* [[DDST]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr half, half* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast half* [[TMP1]] to <4 x half>*
-; CHECK-NEXT: store <4 x half> <half 0xH4900, half 0xH4900, half 0xH4900, half 0xH4900>, <4 x half>* [[TMP2]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi half* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DDST]], [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[DDST_ADDR:%.*]] = phi half* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds half, half* [[DDST_ADDR]], i64 1
-; CHECK-NEXT: store half 0xH4900, half* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @test_half_store(
+; CHECK-LABEL: vector.body:
+; CHECK: store <4 x half> {{.*}} !nontemporal !0
+; CHECK: br
;
entry:
br label %for.body
}
define void @test_i16_store(i16* %ddst) {
-; CHECK-LABEL: @test_i16_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i16, i16* [[DDST:%.*]], i64 1024
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[DDST]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[TMP1]] to <4 x i16>*
-; CHECK-NEXT: store <4 x i16> <i16 10, i16 10, i16 10, i16 10>, <4 x i16>* [[TMP2]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i16* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DDST]], [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[DDST_ADDR:%.*]] = phi i16* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[DDST_ADDR]], i64 1
-; CHECK-NEXT: store i16 10, i16* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @test_i16_store(
+; CHECK-LABEL: vector.body:
+; CHECK: store <4 x i16> {{.*}} !nontemporal !0
+; CHECK: br
;
entry:
br label %for.body
}
define void @test_i32_store(i32* nocapture %ddst) {
-; CHECK-LABEL: @test_i32_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i32, i32* [[DDST:%.*]], i64 4096
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i32, i32* [[DDST]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[NEXT_GEP]], i64 3
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 -3
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <16 x i32>*
-; CHECK-NEXT: store <16 x i32> <i32 10, i32 20, i32 30, i32 40, i32 10, i32 20, i32 30, i32 40, i32 10, i32 20, i32 30, i32 40, i32 10, i32 20, i32 30, i32 40>, <16 x i32>* [[TMP4]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DDST]], [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[DDST_ADDR:%.*]] = phi i32* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR3:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[DDST_ADDR]], i64 1
-; CHECK-NEXT: store i32 10, i32* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DDST_ADDR]], i64 2
-; CHECK-NEXT: store i32 20, i32* [[INCDEC_PTR]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[DDST_ADDR]], i64 3
-; CHECK-NEXT: store i32 30, i32* [[INCDEC_PTR1]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i32, i32* [[DDST_ADDR]], i64 4
-; CHECK-NEXT: store i32 40, i32* [[INCDEC_PTR2]], align 4, !nontemporal !0
-; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @test_i32_store(
+; CHECK-LABEL: vector.body:
+; CHECK: store <16 x i32> {{.*}} !nontemporal !0
+; CHECK: br
;
entry:
br label %for.body
}
define void @test_i33_store(i33* nocapture %ddst) {
-; CHECK-LABEL: @test_i33_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[DDST_ADDR:%.*]] = phi i33* [ [[DDST:%.*]], [[ENTRY]] ], [ [[INCDEC_PTR3:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i33, i33* [[DDST_ADDR]], i64 1
-; CHECK-NEXT: store i33 10, i33* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i33, i33* [[DDST_ADDR]], i64 2
-; CHECK-NEXT: store i33 20, i33* [[INCDEC_PTR]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i33, i33* [[DDST_ADDR]], i64 3
-; CHECK-NEXT: store i33 30, i33* [[INCDEC_PTR1]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i33, i33* [[DDST_ADDR]], i64 4
-; CHECK-NEXT: store i33 40, i33* [[INCDEC_PTR2]], align 4, !nontemporal !0
-; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I]], 3
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @test_i33_store(
+; CHECK-NOT: vector.body:
+; CHECK: ret
;
entry:
br label %for.body
}
define void @test_i40_store(i40* nocapture %ddst) {
-; CHECK-LABEL: @test_i40_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[DDST_ADDR:%.*]] = phi i40* [ [[DDST:%.*]], [[ENTRY]] ], [ [[INCDEC_PTR3:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i40, i40* [[DDST_ADDR]], i64 1
-; CHECK-NEXT: store i40 10, i40* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i40, i40* [[DDST_ADDR]], i64 2
-; CHECK-NEXT: store i40 20, i40* [[INCDEC_PTR]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i40, i40* [[DDST_ADDR]], i64 3
-; CHECK-NEXT: store i40 30, i40* [[INCDEC_PTR1]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i40, i40* [[DDST_ADDR]], i64 4
-; CHECK-NEXT: store i40 40, i40* [[INCDEC_PTR2]], align 4, !nontemporal !0
-; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I]], 3
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @test_i40_store(
+; CHECK-NOT: vector.body:
+; CHECK: ret
;
entry:
br label %for.body
}
define void @test_i64_store(i64* nocapture %ddst) local_unnamed_addr #0 {
-; CHECK-LABEL: @test_i64_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i64, i64* [[DDST:%.*]], i64 1024
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i64, i64* [[DDST]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, i64* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[TMP1]] to <4 x i64>*
-; CHECK-NEXT: store <4 x i64> <i64 10, i64 10, i64 10, i64 10>, <4 x i64>* [[TMP2]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DDST]], [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[DDST_ADDR:%.*]] = phi i64* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i64, i64* [[DDST_ADDR]], i64 1
-; CHECK-NEXT: store i64 10, i64* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @test_i64_store(
+; CHECK-LABEL: vector.body:
+; CHECK: store <4 x i64> {{.*}} !nontemporal !0
+; CHECK: br
;
entry:
br label %for.body
}
define void @test_double_store(double* %ddst) {
-; CHECK-LABEL: @test_double_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr double, double* [[DDST:%.*]], i64 1024
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr double, double* [[DDST]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, double* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> <double 1.000000e+01, double 1.000000e+01, double 1.000000e+01, double 1.000000e+01>, <4 x double>* [[TMP2]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi double* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DDST]], [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[DDST_ADDR:%.*]] = phi double* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds double, double* [[DDST_ADDR]], i64 1
-; CHECK-NEXT: store double 1.000000e+01, double* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @test_double_store(
+; CHECK-LABEL: vector.body:
+; CHECK: store <4 x double> {{.*}} !nontemporal !0
+; CHECK: br
;
entry:
br label %for.body
}
define void @test_i128_store(i128* %ddst) {
-; CHECK-LABEL: @test_i128_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i128, i128* [[DDST:%.*]], i64 1024
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i128, i128* [[DDST]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i128, i128* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i128* [[TMP1]] to <4 x i128>*
-; CHECK-NEXT: store <4 x i128> <i128 10, i128 10, i128 10, i128 10>, <4 x i128>* [[TMP2]], align 4, !nontemporal !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i128* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DDST]], [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[DDST_ADDR:%.*]] = phi i128* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i128, i128* [[DDST_ADDR]], i64 1
-; CHECK-NEXT: store i128 10, i128* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @test_i128_store(
+; CHECK-LABEL: vector.body:
+; CHECK: store <4 x i128> {{.*}} !nontemporal !0
+; CHECK: br
;
entry:
br label %for.body
}
define void @test_i256_store(i256* %ddst) {
-; CHECK-LABEL: @test_i256_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[DDST_ADDR:%.*]] = phi i256* [ [[DDST:%.*]], [[ENTRY]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i256, i256* [[DDST_ADDR]], i64 1
-; CHECK-NEXT: store i256 10, i256* [[DDST_ADDR]], align 4, !nontemporal !0
-; CHECK-NEXT: [[ADD]] = add nuw nsw i32 [[I]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I]], 4092
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @test_i256_store(
+; CHECK-NOT: vector.body:
+; CHECK: ret void
;
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -mtriple=aarch64 -loop-vectorize -force-vector-width=2 < %s | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64--linux-gnu"
define i32 @fn1() local_unnamed_addr #0 {
; We expect the backend to expand all reductions.
-; CHECK-LABEL: @fn1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @b, align 4, !tbaa [[TBAA1:![0-9]+]]
-; CHECK-NEXT: [[CMP40:%.*]] = icmp sgt i32 [[TMP0]], 0
-; CHECK-NEXT: br i1 [[CMP40]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.lr.ph:
-; CHECK-NEXT: [[TMP1:%.*]] = load i16*, i16** @a, align 8, !tbaa [[TBAA5:![0-9]+]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* @b, align 4, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[TMP2]] to i64
-; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP3]], i64 1)
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i16> [ undef, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i16> [ undef, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i16> [ undef, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i16> [ undef, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP8]] to <2 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP9]], align 2, !tbaa [[TBAA7:![0-9]+]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to <2 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i16>, <2 x i16>* [[TMP11]], align 2, !tbaa [[TBAA7]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i16> [[VEC_PHI2]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt <2 x i16> [[VEC_PHI3]], [[WIDE_LOAD4]]
-; CHECK-NEXT: [[TMP14]] = select <2 x i1> [[TMP12]], <2 x i16> [[VEC_PHI2]], <2 x i16> [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP15]] = select <2 x i1> [[TMP13]], <2 x i16> [[VEC_PHI3]], <2 x i16> [[WIDE_LOAD4]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp slt <2 x i16> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp slt <2 x i16> [[VEC_PHI1]], [[WIDE_LOAD4]]
-; CHECK-NEXT: [[TMP18]] = select <2 x i1> [[TMP16]], <2 x i16> [[VEC_PHI]], <2 x i16> [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP19]] = select <2 x i1> [[TMP17]], <2 x i16> [[VEC_PHI1]], <2 x i16> [[WIDE_LOAD4]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[RDX_MINMAX_CMP5:%.*]] = icmp sgt <2 x i16> [[TMP14]], [[TMP15]]
-; CHECK-NEXT: [[RDX_MINMAX_SELECT6:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP5]], <2 x i16> [[TMP14]], <2 x i16> [[TMP15]]
-; CHECK-NEXT: [[TMP21:%.*]] = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> [[RDX_MINMAX_SELECT6]])
-; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <2 x i16> [[TMP18]], [[TMP19]]
-; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i16> [[TMP18]], <2 x i16> [[TMP19]]
-; CHECK-NEXT: [[TMP22:%.*]] = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_LR_PH]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ undef, [[FOR_BODY_LR_PH]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i16 [ undef, [[FOR_BODY_LR_PH]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[D_043:%.*]] = phi i16 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSINK28:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[C_042:%.*]] = phi i16 [ [[BC_MERGE_RDX7]], [[SCALAR_PH]] ], [ [[C_0_:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP23:%.*]] = load i16, i16* [[ARRAYIDX]], align 2, !tbaa [[TBAA7]]
-; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i16 [[C_042]], [[TMP23]]
-; CHECK-NEXT: [[C_0_]] = select i1 [[CMP2]], i16 [[C_042]], i16 [[TMP23]]
-; CHECK-NEXT: [[CMP13:%.*]] = icmp slt i16 [[D_043]], [[TMP23]]
-; CHECK-NEXT: [[DOTSINK28]] = select i1 [[CMP13]], i16 [[D_043]], i16 [[TMP23]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[TMP3]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: [[C_0__LCSSA:%.*]] = phi i16 [ [[C_0_]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[DOTSINK28_LCSSA:%.*]] = phi i16 [ [[DOTSINK28]], [[FOR_BODY]] ], [ [[TMP22]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: [[C_0_LCSSA:%.*]] = phi i16 [ undef, [[ENTRY:%.*]] ], [ [[C_0__LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-NEXT: [[D_0_LCSSA:%.*]] = phi i16 [ undef, [[ENTRY]] ], [ [[DOTSINK28_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i16 [[C_0_LCSSA]], [[D_0_LCSSA]]
-; CHECK-NEXT: [[CONV27:%.*]] = zext i1 [[CMP26]] to i32
-; CHECK-NEXT: ret i32 [[CONV27]]
-;
+; CHECK: @llvm.vector.reduce
entry:
%0 = load i32, i32* @b, align 4, !tbaa !1
%cmp40 = icmp sgt i32 %0, 0
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -mattr=+sve -mtriple aarch64-unknown-linux-gnu -force-vector-width=2 -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
; RUN: FileCheck %s --check-prefix=CHECK-REMARKS < %t
; CHECK-REMARKS: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @alloca(i32** %vla, i64 %N) {
; CHECK-LABEL: @alloca(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP0:%.*]] = alloca i32, align 16
-; CHECK-NEXT: [[TMP1:%.*]] = alloca i32, align 16
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32*, i32** [[VLA:%.*]], i64 [[INDUCTION]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32*, i32** [[VLA]], i64 [[INDUCTION1]]
-; CHECK-NEXT: store i32* [[TMP0]], i32** [[TMP2]], align 8
-; CHECK-NEXT: store i32* [[TMP1]], i32** [[TMP3]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ALLOCA:%.*]] = alloca i32, align 16
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32*, i32** [[VLA]], i64 [[IV]]
-; CHECK-NEXT: store i32* [[ALLOCA]], i32** [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: call void @foo(i32** nonnull [[VLA]])
-; CHECK-NEXT: ret void
-;
+; CHECK-NOT: <vscale x
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -force-vector-interleave=1 -instcombine -mattr=+sve -mtriple aarch64-unknown-linux-gnu \
; RUN: -pass-remarks-missed=loop-vectorize < %s 2>%t | FileCheck %s
; RUN: cat %t | FileCheck %s --check-prefix=CHECK-REMARKS
define void @vec_load(i64 %N, double* nocapture %a, double* nocapture readonly %b) {
-; CHECK-LABEL: @vec_load(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[N]]
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[B:%.*]], i64 [[N]]
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP4]], [[A]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <vscale x 2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP5]], align 8, !alias.scope !5
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x double> @foo_vec(<vscale x 2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP7:%.*]] = fadd <vscale x 2 x double> [[TMP6]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i32 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[TMP8]] to <vscale x 2 x double>*
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP7]], <vscale x 2 x double>* [[TMP9]], align 8, !alias.scope !8, !noalias !5
-; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[IV]]
-; CHECK-NEXT: [[TMP13:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[TMP14:%.*]] = call double @foo(double [[TMP13]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP14]], 1.000000e+00
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IV]]
-; CHECK-NEXT: store double [[ADD]], double* [[ARRAYIDX2]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @vec_load
+; CHECK: vector.body:
+; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
+; CHECK: call <vscale x 2 x double> @foo_vec(<vscale x 2 x double> %[[LOAD]])
entry:
%cmp7 = icmp sgt i64 %N, 0
br i1 %cmp7, label %for.body, label %for.end
}
define void @vec_scalar(i64 %N, double* nocapture %a) {
-; CHECK-LABEL: @vec_scalar(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x double> @foo_vec(<vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+01, i32 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer))
-; CHECK-NEXT: [[TMP5:%.*]] = fsub <vscale x 2 x double> [[TMP4]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i32 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <vscale x 2 x double>*
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP5]], <vscale x 2 x double>* [[TMP7]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = call double @foo(double 1.000000e+01) #[[ATTR5]]
-; CHECK-NEXT: [[SUB:%.*]] = fadd double [[TMP11]], -1.000000e+00
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IV]]
-; CHECK-NEXT: store double [[SUB]], double* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @vec_scalar
+; CHECK: vector.body:
+; CHECK: call <vscale x 2 x double> @foo_vec(<vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+01, i32 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer))
entry:
%cmp7 = icmp sgt i64 %N, 0
br i1 %cmp7, label %for.body, label %for.end
}
define void @vec_ptr(i64 %N, i64* noalias %a, i64** readnone %b) {
-; CHECK-LABEL: @vec_ptr(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64*, i64** [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64** [[TMP4]] to <vscale x 2 x i64*>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i64*>, <vscale x 2 x i64*>* [[TMP5]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @bar_vec(<vscale x 2 x i64*> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[TMP7]] to <vscale x 2 x i64>*
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP6]], <vscale x 2 x i64>* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64*, i64** [[B]], i64 [[IV]]
-; CHECK-NEXT: [[LOAD:%.*]] = load i64*, i64** [[GEP]], align 8
-; CHECK-NEXT: [[CALL:%.*]] = call i64 @bar(i64* [[LOAD]]) #[[ATTR6:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
-; CHECK-NEXT: store i64 [[CALL]], i64* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @vec_ptr
+; CHECK: vector.body:
+; CHECK: %[[LOAD:.*]] = load <vscale x 2 x i64*>, <vscale x 2 x i64*>*
+; CHECK: call <vscale x 2 x i64> @bar_vec(<vscale x 2 x i64*> %[[LOAD]])
entry:
%cmp7 = icmp sgt i64 %N, 0
br i1 %cmp7, label %for.body, label %for.end
}
define void @vec_intrinsic(i64 %N, double* nocapture readonly %a) {
-; CHECK-LABEL: @vec_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <vscale x 2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, <vscale x 2 x double>* [[TMP5]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = call fast <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP7:%.*]] = fadd fast <vscale x 2 x double> [[TMP6]], shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> poison, double 1.000000e+00, i32 0), <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP4]] to <vscale x 2 x double>*
-; CHECK-NEXT: store <vscale x 2 x double> [[TMP7]], <vscale x 2 x double>* [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[TMP13:%.*]] = call fast double @llvm.sin.f64(double [[TMP12]]) #[[ATTR7:[0-9]+]]
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast double [[TMP13]], 1.000000e+00
-; CHECK-NEXT: store double [[ADD]], double* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @vec_intrinsic
+; CHECK: vector.body:
+; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
+; CHECK: call fast <vscale x 2 x double> @sin_vec_nxv2f64(<vscale x 2 x double> %[[LOAD]])
entry:
%cmp7 = icmp sgt i64 %N, 0
br i1 %cmp7, label %for.body, label %for.end
; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @vec_sin_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
-; CHECK-LABEL: @vec_sin_no_mapping(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -2
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <2 x float>*, !dbg [[DBG20:![0-9]+]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4, !dbg [[DBG20]]
-; CHECK-NEXT: [[TMP2:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[WIDE_LOAD]]), !dbg [[DBG23:![0-9]+]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>*, !dbg [[DBG24:![0-9]+]]
-; CHECK-NEXT: store <2 x float> [[TMP2]], <2 x float>* [[TMP4]], align 4, !dbg [[DBG24]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX]], align 4, !dbg [[DBG20]]
-; CHECK-NEXT: [[TMP7:%.*]] = tail call fast float @llvm.sin.f32(float [[TMP6]]), !dbg [[DBG23]]
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[DST]], i64 [[I_07]]
-; CHECK-NEXT: store float [[TMP7]], float* [[ARRAYIDX1]], align 4, !dbg [[DBG24]]
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-;
+; CHECK: @vec_sin_no_mapping
+; CHECK: call fast <2 x float> @llvm.sin.v2f32
+; CHECK-NOT: <vscale x
entry:
br label %for.body
; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:40: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @vec_sin_no_mapping_ite(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
+; CHECK: @vec_sin_no_mapping_ite
+; CHECK-NOT: <vscale x
+; CHECK: ret
entry:
br label %for.body
; CHECK-REMARKS-NEXT: t.c:3:20: Instruction with invalid costs prevented vectorization at VF=(vscale x 1, vscale x 2): call to llvm.sin.f32
; CHECK-REMARKS-NEXT: t.c:3:30: Instruction with invalid costs prevented vectorization at VF=(vscale x 1): store
define void @vec_sin_fixed_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) {
+; CHECK: @vec_sin_fixed_mapping
+; CHECK: call fast <2 x float> @llvm.sin.v2f32
+; CHECK-NOT: <vscale x
entry:
br label %for.body
; in the loop below we can still vectorize the loop because SVE has
; hardware support in the form of the 'fsqrt' instruction.
define void @vec_sqrt_no_mapping(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %n) #0 {
+; CHECK: @vec_sqrt_no_mapping
+; CHECK: call fast <vscale x 2 x float> @llvm.sqrt.nxv2f32
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -S | FileCheck %s
; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s
; Future implementation of llvm.vp could allow this to happen
define void @predication_in_loop(i32* %a, i32* %b, i32* %cond) #0 {
+; CHECK-LABEL: @predication_in_loop
+; CHECK-NOT: sdiv <vscale x 4 x i32>
+;
entry:
br label %for.body
; "Max legal vector width too small, scalable vectorization unfeasible.."
define void @unpredicated_loop_predication_through_tailfolding(i32* %a, i32* %b) #0 {
-; CHECK-LABEL: @unpredicated_loop_predication_through_tailfolding(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
-; CHECK-NEXT: [[B3:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 1032
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B]], i64 1024
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !11, !noalias !14
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4, !alias.scope !14
-; CHECK-NEXT: [[TMP7:%.*]] = sdiv <4 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP0]], 8
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP11]], align 4, !alias.scope !11, !noalias !14
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]]
-; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[SDIV:%.*]] = sdiv i32 [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[IV]], 8
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
-; CHECK-NEXT: store i32 [[SDIV]], i32* [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @unpredicated_loop_predication_through_tailfolding
+; CHECK-NOT: sdiv <vscale x 4 x i32>
entry:
br label %loop
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-missed=loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve,+bf16 -S 2>%t | FileCheck %s -check-prefix=CHECK
; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARK
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @add(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @add(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19]] = add <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[ADD_LCSSA]]
-;
+; CHECK-LABEL: @add
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[ADD1:.*]] = add <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ADD:.*]] = add <vscale x 8 x i32> %[[ADD2]], %[[ADD1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> %[[ADD]])
entry:
br label %for.body
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @or(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @or(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18]] = or <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19]] = or <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[OR]] = or i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[OR_LCSSA]]
-;
+; CHECK-LABEL: @or
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[OR1:.*]] = or <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[OR2:.*]] = or <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[OR:.*]] = or <vscale x 8 x i32> %[[OR2]], %[[OR1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> %[[OR]])
entry:
br label %for.body
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @and(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @and(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 -1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 -1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18]] = and <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19]] = and <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = and <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[AND]] = and i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[AND_LCSSA]]
-;
+; CHECK-LABEL: @and
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[AND1:.*]] = and <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[AND2:.*]] = and <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[AND:.*]] = and <vscale x 8 x i32> %[[AND2]], %[[AND1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> %[[AND]])
entry:
br label %for.body
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @xor(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @xor(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18]] = xor <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19]] = xor <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[XOR]] = xor i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[XOR_LCSSA]]
-;
+; CHECK-LABEL: @xor
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[XOR1:.*]] = xor <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[XOR2:.*]] = xor <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[XOR:.*]] = xor <vscale x 8 x i32> %[[XOR2]], %[[XOR1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> %[[XOR]])
entry:
br label %for.body
; SMIN
define i32 @smin(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @smin(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = icmp slt <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp slt <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x i32> [[WIDE_LOAD]], <vscale x 8 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i32> [[WIDE_LOAD2]], <vscale x 8 x i32> [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <vscale x 8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x i32> [[TMP20]], <vscale x 8 x i32> [[TMP21]]
-; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP26]], [[SUM_010]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], i32 [[TMP26]], i32 [[SUM_010]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @smin
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[ICMP1:.*]] = icmp slt <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[ICMP2:.*]] = icmp slt <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ICMP:.*]] = icmp slt <vscale x 8 x i32> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[ICMP]], <vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> %[[SEL]])
entry:
br label %for.body
; UMAX
define i32 @umax(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @umax(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x i32> [[WIDE_LOAD]], <vscale x 8 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i32> [[WIDE_LOAD2]], <vscale x 8 x i32> [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <vscale x 8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x i32> [[TMP20]], <vscale x 8 x i32> [[TMP21]]
-; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP26]], [[SUM_010]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], i32 [[TMP26]], i32 [[SUM_010]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @umax
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[ICMP1:.*]] = icmp ugt <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[ICMP2:.*]] = icmp ugt <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ICMP:.*]] = icmp ugt <vscale x 8 x i32> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[ICMP]], <vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> %[[SEL]])
entry:
br label %for.body
; FADD (FAST)
define float @fadd_fast(float* noalias nocapture readonly %a, i64 %n) {
-; CHECK-LABEL: @fadd_fast(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> zeroinitializer, float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18]] = fadd fast <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19]] = fadd fast <vscale x 8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <vscale x 8 x float> [[TMP19]], [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = fadd fast float [[TMP24]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[ADD_LCSSA]]
-;
+; CHECK-LABEL: @fadd_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
+; CHECK: %[[ADD1:.*]] = fadd fast <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = fadd fast <vscale x 8 x float> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ADD:.*]] = fadd fast <vscale x 8 x float> %[[ADD2]], %[[ADD1]]
+; CHECK-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[ADD]])
entry:
br label %for.body
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
define bfloat @fadd_fast_bfloat(bfloat* noalias nocapture readonly %a, i64 %n) {
-; CHECK-LABEL: @fadd_fast_bfloat(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x bfloat> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x bfloat> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds bfloat, bfloat* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds bfloat, bfloat* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds bfloat, bfloat* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast bfloat* [[TMP4]] to <8 x bfloat>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds bfloat, bfloat* [[TMP2]], i32 8
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast bfloat* [[TMP6]] to <8 x bfloat>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8]] = fadd fast <8 x bfloat> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP9]] = fadd fast <8 x bfloat> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x bfloat> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi bfloat [ 0xR0000, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi bfloat [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds bfloat, bfloat* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = load bfloat, bfloat* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = fadd fast bfloat [[TMP12]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi bfloat [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret bfloat [[ADD_LCSSA]]
-;
+; CHECK-LABEL: @fadd_fast_bfloat
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <8 x bfloat>
+; CHECK: %[[LOAD2:.*]] = load <8 x bfloat>
+; CHECK: %[[FADD1:.*]] = fadd fast <8 x bfloat> %[[LOAD1]]
+; CHECK: %[[FADD2:.*]] = fadd fast <8 x bfloat> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[RDX:.*]] = fadd fast <8 x bfloat> %[[FADD2]], %[[FADD1]]
+; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> %[[RDX]])
entry:
br label %for.body
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define float @fmin_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
-; CHECK-LABEL: @fmin_fast(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = fcmp olt <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19:%.*]] = fcmp olt <vscale x 8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp olt <vscale x 8 x float> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x float> [[TMP20]], <vscale x 8 x float> [[TMP21]]
-; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt float [[TMP26]], [[SUM_07]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], float [[TMP26]], float [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @fmin_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
+; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x float> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x float> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x float> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x float> %[[SEL1]], <vscale x 8 x float> %[[SEL2]]
+; CHECK-NEXT: call float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> %[[SEL]])
entry:
br label %for.body
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define float @fmax_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
-; CHECK-LABEL: @fmax_fast(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast ogt <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19:%.*]] = fcmp fast ogt <vscale x 8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <vscale x 8 x float> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x float> [[TMP20]], <vscale x 8 x float> [[TMP21]]
-; CHECK-NEXT: [[TMP25:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast ogt float [[TMP26]], [[SUM_07]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], float [[TMP26]], float [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @fmax_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
+; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x float> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x float> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x float> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x float> %[[SEL1]], <vscale x 8 x float> %[[SEL2]]
+; CHECK-NEXT: call fast float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> %[[SEL]])
entry:
br label %for.body
; CHECK-REMARK: loop not vectorized: value that could not be identified as reduction is used outside the loop
define void @invariant_store(i32* %dst, i32* readonly %src) {
-; CHECK-LABEL: @invariant_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 42
-; CHECK-NEXT: store i32 0, i32* [[GEP_DST]], align 4
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[GEP_SRC]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[SUM]], [[TMP0]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[GEP_DST]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @invariant_store
+; CHECK-NOT: vector.body
entry:
%gep.dst = getelementptr inbounds i32, i32* %dst, i64 42
store i32 0, i32* %gep.dst, align 4
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
define i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @mul(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8]] = mul <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP9]] = mul <4 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <4 x i32> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL]] = mul nsw i32 [[TMP12]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[MUL_LCSSA]]
-;
+; CHECK-LABEL: @mul
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <4 x i32>
+; CHECK: %[[LOAD2:.*]] = load <4 x i32>
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]]
+; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
entry:
br label %for.body
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
define i32 @memory_dependence(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @memory_dependence(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 4
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[TMP0]], 32
-; CHECK-NEXT: [[TMP17:%.*]] = add nuw nsw i64 [[TMP1]], 32
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP21]], align 4
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 4
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP23]], align 4
-; CHECK-NEXT: [[TMP24]] = mul <4 x i32> [[WIDE_LOAD3]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP25]] = mul <4 x i32> [[WIDE_LOAD4]], [[VEC_PHI1]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <4 x i32> [[TMP25]], [[TMP24]]
-; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ [[MUL:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
-; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP29]], [[TMP28]]
-; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i64 [[I]], 32
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[ADD2]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT: [[MUL]] = mul nsw i32 [[TMP29]], [[SUM]]
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[MUL_LCSSA]]
-;
+; CHECK-LABEL: @memory_dependence
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <4 x i32>
+; CHECK: %[[LOAD2:.*]] = load <4 x i32>
+; CHECK: %[[LOAD3:.*]] = load <4 x i32>
+; CHECK: %[[LOAD4:.*]] = load <4 x i32>
+; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
+; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
+; CHECK: middle.block:
+; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -force-ordered-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP8]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP8]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP13]], [[SUM_07]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_strict(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP7]], align 4
-; CHECK-ORDERED-NEXT: [[TMP8]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP12]], [[SUM_07]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]]
-;
-
-
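+; In the strict (ordered) lowering of @fadd_strict the running sum is expected
+; to stay in a scalar float phi, with each <vscale x 8 x float> of loads folded
+; into it by an in-loop call to llvm.vector.reduce.fadd.nxv8f32 so the additions
+; keep their source order.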
+; CHECK-ORDERED-LABEL: @fadd_strict
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[VEC_PHI]], <vscale x 8 x float> %[[LOAD]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: ret float %[[PHI]]
+
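+; When reassociation is allowed, the reduction should instead use a
+; <vscale x 8 x float> accumulator updated by an in-loop vector fadd. Its start
+; value is the scalar 0.0 inserted into a splat of -0.0 (the fadd identity), and
+; the accumulator is reduced to a scalar only once, in middle.block.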
+; CHECK-UNORDERED-LABEL: @fadd_strict
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ %[[FADD_VEC:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[LOAD_VEC:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: %[[FADD_VEC]] = fadd <vscale x 8 x float> %[[LOAD_VEC]], %[[VEC_PHI]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[FADD_VEC]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}}
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
}
define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict_unroll(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP25]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 8
-; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP27]]
-; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP29]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 16
-; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP31]]
-; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP33]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = mul i32 [[TMP34]], 24
-; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP35]]
-; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP37]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP38]] = fadd <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT: [[TMP39]] = fadd <vscale x 8 x float> [[WIDE_LOAD4]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT: [[TMP40]] = fadd <vscale x 8 x float> [[WIDE_LOAD5]], [[VEC_PHI2]]
-; CHECK-UNORDERED-NEXT: [[TMP41]] = fadd <vscale x 8 x float> [[WIDE_LOAD6]], [[VEC_PHI3]]
-; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 32
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP43]]
-; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[TMP39]], [[TMP38]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd <vscale x 8 x float> [[TMP40]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd <vscale x 8 x float> [[TMP41]], [[BIN_RDX7]]
-; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX8]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP46]], [[SUM_07]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP45]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_strict_unroll(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP25]], align 4
-; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 8
-; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP27]]
-; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP29]], align 4
-; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 16
-; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP31]]
-; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP33]], align 4
-; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = mul i32 [[TMP34]], 24
-; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP35]]
-; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP37]], align 4
-; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP38]], <vscale x 8 x float> [[WIDE_LOAD1]])
-; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP39]], <vscale x 8 x float> [[WIDE_LOAD2]])
-; CHECK-ORDERED-NEXT: [[TMP41]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP40]], <vscale x 8 x float> [[WIDE_LOAD3]])
-; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 32
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP43]]
-; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP45]], [[SUM_07]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP41]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]]
-;
-
-
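+; @fadd_strict_unroll is the same reduction with an interleave count of 4. The
+; ordered lowering must still serialize the additions: a single scalar phi is
+; updated by four chained llvm.vector.reduce.fadd calls, one per wide load, and
+; the -NOT line guards against a second scalar reduction phi being created.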
+; CHECK-ORDERED-LABEL: @fadd_strict_unroll
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
+; CHECK-ORDERED-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: %[[LOAD2:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: %[[LOAD3:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: %[[LOAD4:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[VEC_PHI1]], <vscale x 8 x float> %[[LOAD1]])
+; CHECK-ORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX1]], <vscale x 8 x float> %[[LOAD2]])
+; CHECK-ORDERED: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX2]], <vscale x 8 x float> %[[LOAD3]])
+; CHECK-ORDERED: %[[RDX4]] = call float @llvm.vector.reduce.fadd.nxv8f32(float %[[RDX3]], <vscale x 8 x float> %[[LOAD4]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ]
+; CHECK-ORDERED: ret float %[[PHI]]
+
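+; Unordered, each of the four interleaved parts gets its own vector accumulator;
+; the partial sums are combined by pairwise vector fadds in middle.block before
+; a single final llvm.vector.reduce.fadd.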
+; CHECK-UNORDERED-LABEL: @fadd_strict_unroll
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ %[[VEC_FADD3:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ %[[VEC_FADD4:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD3:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD4:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <vscale x 8 x float> %[[VEC_LOAD1]], %[[VEC_PHI1]]
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <vscale x 8 x float> %[[VEC_LOAD2]], %[[VEC_PHI2]]
+; CHECK-UNORDERED: %[[VEC_FADD3]] = fadd <vscale x 8 x float> %[[VEC_LOAD3]], %[[VEC_PHI3]]
+; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <vscale x 8 x float> %[[VEC_LOAD4]], %[[VEC_PHI4]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <vscale x 8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]]
+; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <vscale x 8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]]
+; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <vscale x 8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]]
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[BIN_RDX3]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}}
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
}
define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_interleave(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-NOT-VECTORIZED-NEXT: [[A1:%.*]] = load float, float* [[A]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[A2]], [[ENTRY:%.*]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDXB1]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD1]] = fadd float [[TMP0]], [[ADD_PHI2]]
-; CHECK-NOT-VECTORIZED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDXB2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD2]] = fadd float [[TMP1]], [[ADD_PHI1]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: ret void
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict_interleave(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-UNORDERED-NEXT: [[A1:%.*]] = load float, float* [[A]], align 4
-; CHECK-UNORDERED-NEXT: [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], -2
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float [[A2]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float [[A1]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = add <vscale x 4 x i64> [[TMP9]], zeroinitializer
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP10]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-UNORDERED-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP11]]
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = mul i64 2, [[TMP13]]
-; CHECK-UNORDERED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP14]], i32 0
-; CHECK-UNORDERED-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ [[TMP7]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x float> [ [[TMP8]], [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP15]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
-; CHECK-UNORDERED-NEXT: [[TMP16]] = fadd <vscale x 4 x float> [[WIDE_MASKED_GATHER]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = or <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], <vscale x 4 x i64> [[TMP17]]
-; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP18]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
-; CHECK-UNORDERED-NEXT: [[TMP19]] = fadd <vscale x 4 x float> [[WIDE_MASKED_GATHER2]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-UNORDERED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP16]])
-; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP19]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = load float, float* [[ARRAYIDXB1]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD1]] = fadd float [[TMP25]], [[ADD_PHI2]]
-; CHECK-UNORDERED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]]
-; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = load float, float* [[ARRAYIDXB2]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD2]] = fadd float [[TMP26]], [[ADD_PHI1]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP24]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4
-; CHECK-UNORDERED-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
-; CHECK-UNORDERED-NEXT: ret void
-;
-; CHECK-ORDERED-LABEL: @fadd_strict_interleave(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-ORDERED-NEXT: [[A1:%.*]] = load float, float* [[A]], align 4
-; CHECK-ORDERED-NEXT: [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], -2
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]]
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i64> [[TMP7]], zeroinitializer
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = mul <vscale x 4 x i64> [[TMP8]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP9]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = mul i64 2, [[TMP11]]
-; CHECK-ORDERED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i32 0
-; CHECK-ORDERED-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[B:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-ORDERED-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
-; CHECK-ORDERED-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI1]], <vscale x 4 x float> [[WIDE_MASKED_GATHER]])
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = or <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[B]], <vscale x 4 x i64> [[TMP15]]
-; CHECK-ORDERED-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP16]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
-; CHECK-ORDERED-NEXT: [[TMP17]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[WIDE_MASKED_GATHER2]])
-; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
-; CHECK-ORDERED-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = load float, float* [[ARRAYIDXB1]], align 4
-; CHECK-ORDERED-NEXT: [[ADD1]] = fadd float [[TMP21]], [[ADD_PHI2]]
-; CHECK-ORDERED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]]
-; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDXB2]], align 4
-; CHECK-ORDERED-NEXT: [[ADD2]] = fadd float [[TMP22]], [[ADD_PHI1]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4
-; CHECK-ORDERED-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
-; CHECK-ORDERED-NEXT: ret void
-;
-
-
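+; @fadd_strict_interleave carries two independent strict reductions, roughly:
+;   a0 = a[0]; a1 = a[1];
+;   for (i = 0; i < n; i += 2) { a0 += b[i]; a1 += b[i+1]; }
+;   a[0] = a0; a[1] = a1;
+; so the ordered lowering needs two scalar float phis, seeded with the initial
+; loads of a[0] and a[1], and each gathered vector of b elements is folded into
+; its own phi by a separate llvm.vector.reduce.fadd.nxv4f32.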
+; CHECK-ORDERED-LABEL: @fadd_strict_interleave
+; CHECK-ORDERED: entry
+; CHECK-ORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1
+; CHECK-ORDERED: %[[LOAD1:.*]] = load float, float* %a
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]]
+; CHECK-ORDERED: vector.ph
+; CHECK-ORDERED: %[[STEPVEC1:.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-ORDERED: %[[STEPVEC_ADD1:.*]] = add <vscale x 4 x i64> %[[STEPVEC1]], zeroinitializer
+; CHECK-ORDERED: %[[STEPVEC_MUL:.*]] = mul <vscale x 4 x i64> %[[STEPVEC_ADD1]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED: %[[INDUCTION:.*]] = add <vscale x 4 x i64> zeroinitializer, %[[STEPVEC_MUL]]
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ]
+; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ]
+; CHECK-ORDERED: %[[VEC_IND:.*]] = phi <vscale x 4 x i64> [ %[[INDUCTION]], %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK-ORDERED: %[[GEP1:.*]] = getelementptr inbounds float, float* %b, <vscale x 4 x i64> %[[VEC_IND]]
+; CHECK-ORDERED: %[[MGATHER1:.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> %[[GEP1]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
+; CHECK-ORDERED: %[[RDX1]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI1]], <vscale x 4 x float> %[[MGATHER1]])
+; CHECK-ORDERED: %[[OR:.*]] = or <vscale x 4 x i64> %[[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED: %[[GEP2:.*]] = getelementptr inbounds float, float* %b, <vscale x 4 x i64> %[[OR]]
+; CHECK-ORDERED: %[[MGATHER2:.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> %[[GEP2]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
+; CHECK-ORDERED: %[[RDX2]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI2]], <vscale x 4 x float> %[[MGATHER2]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: ret void
+
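+; The unordered lowering keeps both running sums in <vscale x 4 x float>
+; accumulators seeded by inserting the initial values of a[0] and a[1] into
+; splats of -0.0, and reduces each accumulator separately in middle.block.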
+; CHECK-UNORDERED-LABEL: @fadd_strict_interleave
+; CHECK-UNORDERED: entry
+; CHECK-UNORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float* %a
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]]
+; CHECK-UNORDERED: vector.ph
+; CHECK-UNORDERED: %[[INS_ELT2:.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float %[[LOAD2]], i32 0
+; CHECK-UNORDERED: %[[INS_ELT1:.*]] = insertelement <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float %[[LOAD1]], i32 0
+; CHECK-UNORDERED: %[[STEPVEC1:.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-UNORDERED: %[[STEPVEC_ADD1:.*]] = add <vscale x 4 x i64> %[[STEPVEC1]], zeroinitializer
+; CHECK-UNORDERED: %[[STEPVEC_MUL:.*]] = mul <vscale x 4 x i64> %[[STEPVEC_ADD1]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-UNORDERED: %[[INDUCTION:.*]] = add <vscale x 4 x i64> zeroinitializer, %[[STEPVEC_MUL]]
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <vscale x 4 x float> [ %[[INS_ELT2]], %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <vscale x 4 x float> [ %[[INS_ELT1]], %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[GEP1:.*]] = getelementptr inbounds float, float* %b, <vscale x 4 x i64>
+; CHECK-UNORDERED: %[[MGATHER1:.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> %[[GEP1]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
+; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <vscale x 4 x float> %[[MGATHER1]], %[[VEC_PHI1]]
+; CHECK-UNORDERED: %[[OR:.*]] = or <vscale x 4 x i64> {{.*}}, shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-UNORDERED: %[[GEP2:.*]] = getelementptr inbounds float, float* %b, <vscale x 4 x i64> %[[OR]]
+; CHECK-UNORDERED: %[[MGATHER2:.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> %[[GEP2]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <vscale x 4 x float> %[[MGATHER2]], %[[VEC_PHI2]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[VEC_RDX1:.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[VEC_FADD1]])
+; CHECK-UNORDERED: %[[VEC_RDX2:.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD3]], {{.*}}
+; CHECK-UNORDERED: %[[LOAD4:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float %[[LOAD4]], {{.*}}
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RDX1:.*]] = phi float [ %[[FADD1]], %for.body ], [ %[[VEC_RDX1]], %middle.block ]
+; CHECK-UNORDERED: %[[RDX2:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[VEC_RDX2]], %middle.block ]
+; CHECK-UNORDERED: store float %[[RDX1]], float* %a
+; CHECK-UNORDERED: store float %[[RDX2]], float* {{.*}}
+; CHECK-UNORDERED: ret void
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_interleave
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
%arrayidxa = getelementptr inbounds float, float* %a, i64 1
}
define float @fadd_of_sum(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_of_sum(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-NOT-VECTORIZED: for.body.preheader:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]]
-; CHECK-NOT-VECTORIZED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]]
-; CHECK-NOT-VECTORIZED: for.end.loopexit:
-; CHECK-NOT-VECTORIZED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_END]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[RES]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_of_sum(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01
-; CHECK-UNORDERED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-UNORDERED: for.body.preheader:
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP2]]
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]]
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP8]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]]
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd <vscale x 4 x float> [[VEC_PHI]], [[TMP12]]
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP13]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[TMP18]], [[TMP19]]
-; CHECK-UNORDERED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK-UNORDERED: for.end.loopexit:
-; CHECK-UNORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_END]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-UNORDERED-NEXT: ret float [[RES]]
-;
-; CHECK-ORDERED-LABEL: @fadd_of_sum(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01
-; CHECK-ORDERED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-ORDERED: for.body.preheader:
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP2]]
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]]
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]]
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP8]], align 4
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP11]], align 4
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-ORDERED-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP12]])
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[TMP17]], [[TMP18]]
-; CHECK-ORDERED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK-ORDERED: for.end.loopexit:
-; CHECK-ORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_END]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-ORDERED-NEXT: ret float [[RES]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_of_sum
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
+; CHECK-ORDERED: %[[LOAD2:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
+; CHECK-ORDERED: %[[ADD:.*]] = fadd <vscale x 4 x float> %[[LOAD1]], %[[LOAD2]]
+; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI1]], <vscale x 4 x float> %[[ADD]])
+; CHECK-ORDERED: for.end.loopexit
+; CHECK-ORDERED: %[[EXIT_PHI:.*]] = phi float [ {{.*}}, %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ]
+; CHECK-ORDERED: ret float %[[PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_of_sum
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <vscale x 4 x float> %[[VEC_LOAD1]], %[[VEC_LOAD2]]
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <vscale x 4 x float> %[[VEC_PHI]], %[[VEC_FADD1]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], %[[LOAD2]]
+; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float {{.*}}, %[[FADD1]]
+; CHECK-UNORDERED: for.end.loopexit
+; CHECK-UNORDERED: %[[EXIT:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT]], %for.end.loopexit ]
+; CHECK-UNORDERED: ret float %[[SUM]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_of_sum
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
%arrayidx = getelementptr inbounds float, float* %a, i64 1
}
define float @fadd_conditional(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_conditional(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[RES:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP0]], 0.000000e+00
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-NOT-VECTORIZED: if.then:
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_INC]]
-; CHECK-NOT-VECTORIZED: for.inc:
-; CHECK-NOT-VECTORIZED-NEXT: [[PHI:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[RDX]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_conditional(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float 1.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 4 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> poison)
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD]]
-; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd <vscale x 4 x float> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> [[TMP13]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP18]], 0.000000e+00
-; CHECK-UNORDERED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-UNORDERED: if.then:
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: br label [[FOR_INC]]
-; CHECK-UNORDERED: for.inc:
-; CHECK-UNORDERED-NEXT: [[PHI:%.*]] = phi float [ [[TMP19]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[RDX]]
-;
-; CHECK-ORDERED-LABEL: @fadd_conditional(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 4 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP7]], align 4
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = fcmp une <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[TMP9]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x float> poison)
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> [[WIDE_MASKED_LOAD]]
-; CHECK-ORDERED-NEXT: [[TMP13]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[PREDPHI]])
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP17]], 0.000000e+00
-; CHECK-ORDERED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-ORDERED: if.then:
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: br label [[FOR_INC]]
-; CHECK-ORDERED: for.inc:
-; CHECK-ORDERED-NEXT: [[PHI:%.*]] = phi float [ [[TMP18]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[RDX]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_conditional
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
+; CHECK-ORDERED: %[[FCMP:.*]] = fcmp une <vscale x 4 x float> %[[LOAD]], zeroinitializer
+; CHECK-ORDERED: %[[MASKED_LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* {{.*}}, i32 4, <vscale x 4 x i1> %[[FCMP]], <vscale x 4 x float> poison)
+; CHECK-ORDERED: %[[XOR:.*]] = xor <vscale x 4 x i1> %[[FCMP]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-ORDERED: %[[SELECT:.*]] = select <vscale x 4 x i1> %[[XOR]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> %[[MASKED_LOAD]]
+; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.nxv4f32(float %[[VEC_PHI]], <vscale x 4 x float> %[[SELECT]])
+; CHECK-ORDERED: scalar.ph
+; CHECK-ORDERED: %[[MERGE_RDX:.*]] = phi float [ 1.000000e+00, %entry ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: %[[RES:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
+; CHECK-ORDERED: if.then
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-ORDERED: for.inc
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[LOAD2]], %if.then ], [ 3.000000e+00, %for.body ]
+; CHECK-ORDERED: %[[FADD]] = fadd float %[[RES]], %[[PHI]]
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: ret float %[[RDX_PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_conditional
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <vscale x 4 x float> [ insertelement (<vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), float 1.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
+; CHECK-UNORDERED: %[[FCMP:.*]] = fcmp une <vscale x 4 x float> %[[LOAD1]], zeroinitializer
+; CHECK-UNORDERED: %[[MASKED_LOAD:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* {{.*}}, i32 4, <vscale x 4 x i1> %[[FCMP]], <vscale x 4 x float> poison)
+; CHECK-UNORDERED: %[[XOR:.*]] = xor <vscale x 4 x i1> %[[FCMP]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-UNORDERED: %[[SELECT:.*]] = select <vscale x 4 x i1> %[[XOR]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> %[[MASKED_LOAD]]
+; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <vscale x 4 x float> %[[VEC_PHI]], %[[SELECT]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, <vscale x 4 x float> %[[VEC_FADD]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
+; CHECK-UNORDERED: for.inc
+; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RES]], {{.*}}
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RDX_PHI]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_conditional
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Negative test - the loop contains multiple fadds, which we cannot safely reorder.
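; A rough C equivalent of the loop (an illustrative sketch only; the initial
; value and types are taken from the scalar IR checked below):
;
;   float fadd_multiple(float *a, float *b, int64_t n) {
;     float sum = -0.0f;
;     for (int64_t i = 0; i < n; ++i) {
;       sum += a[i];
;       sum += b[i];
;     }
;     return sum;
;   }
;
; With the strict (ordered) configuration the loop is not vectorized at all,
; while the unordered configuration widens both fadds and performs a single
; reduction after the loop, as the checks below show.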
define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP1]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[RDX]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_multiple(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float -0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = fadd <vscale x 8 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP12]] = fadd <vscale x 8 x float> [[TMP8]], [[WIDE_LOAD1]]
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]]
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP12]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP17]]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP18]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[RDX]]
-;
-; CHECK-ORDERED-LABEL: @fadd_multiple(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP1]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: ret float [[RDX]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_multiple
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @fadd_multiple
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[PHI:.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float -0.000000e+00, i32 0), %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <vscale x 8 x float> %[[PHI]], %[[VEC_LOAD1]]
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <vscale x 8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[SUM]], %[[LOAD1]]
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RET]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Test case where the loop has a call to the llvm.fmuladd intrinsic.
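; Roughly, each iteration computes sum = fmuladd(a[i], b[i], sum). As the
; checks below show, the strict (ordered) configuration splits this into a
; widened fmul followed by an in-order @llvm.vector.reduce.fadd, whereas the
; unordered configuration keeps widened @llvm.fmuladd calls and reduces once
; in the middle block.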
define float @fmuladd_strict(float* %a, float* %b, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fmuladd_strict(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP56:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP58:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP59:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP25]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 8
-; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP27]]
-; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP29]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 16
-; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP31]]
-; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP33]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = mul i32 [[TMP34]], 24
-; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP35]]
-; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP37]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP43]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = mul i32 [[TMP44]], 8
-; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP45]]
-; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP47]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP48:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP49:%.*]] = mul i32 [[TMP48]], 16
-; CHECK-UNORDERED-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP49]]
-; CHECK-UNORDERED-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP51]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP52:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP53:%.*]] = mul i32 [[TMP52]], 24
-; CHECK-UNORDERED-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP53]]
-; CHECK-UNORDERED-NEXT: [[TMP55:%.*]] = bitcast float* [[TMP54]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP55]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP56]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI]])
-; CHECK-UNORDERED-NEXT: [[TMP57]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[WIDE_LOAD8]], <vscale x 8 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED-NEXT: [[TMP58]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[WIDE_LOAD9]], <vscale x 8 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED-NEXT: [[TMP59]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[WIDE_LOAD10]], <vscale x 8 x float> [[VEC_PHI3]])
-; CHECK-UNORDERED-NEXT: [[TMP60:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 32
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP61]]
-; CHECK-UNORDERED-NEXT: [[TMP62:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP62]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[TMP57]], [[TMP56]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd <vscale x 8 x float> [[TMP58]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd <vscale x 8 x float> [[TMP59]], [[BIN_RDX11]]
-; CHECK-UNORDERED-NEXT: [[TMP63:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX12]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP64:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP65:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP64]], float [[TMP65]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fmuladd_strict(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP63:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP25]], align 4
-; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 8
-; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP27]]
-; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP29]], align 4
-; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 16
-; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP31]]
-; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP33]], align 4
-; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = mul i32 [[TMP34]], 24
-; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP35]]
-; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP37]], align 4
-; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP43]], align 4
-; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = mul i32 [[TMP44]], 8
-; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP45]]
-; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP47]], align 4
-; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = mul i32 [[TMP48]], 16
-; CHECK-ORDERED-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP49]]
-; CHECK-ORDERED-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP51]], align 4
-; CHECK-ORDERED-NEXT: [[TMP52:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP53:%.*]] = mul i32 [[TMP52]], 24
-; CHECK-ORDERED-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP53]]
-; CHECK-ORDERED-NEXT: [[TMP55:%.*]] = bitcast float* [[TMP54]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP55]], align 4
-; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED-NEXT: [[TMP57:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED-NEXT: [[TMP58:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED-NEXT: [[TMP59:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED-NEXT: [[TMP60:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP56]])
-; CHECK-ORDERED-NEXT: [[TMP61:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP60]], <vscale x 8 x float> [[TMP57]])
-; CHECK-ORDERED-NEXT: [[TMP62:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP61]], <vscale x 8 x float> [[TMP58]])
-; CHECK-ORDERED-NEXT: [[TMP63]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP62]], <vscale x 8 x float> [[TMP59]])
-; CHECK-ORDERED-NEXT: [[TMP64:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP65:%.*]] = mul i64 [[TMP64]], 32
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP65]]
-; CHECK-ORDERED-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP67:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP68:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP67]], float [[TMP68]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fmuladd_strict
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ]
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
+; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[RDX3]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_strict
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Same as above, but the call to the llvm.fmuladd intrinsic has a fast-math flag.
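; For illustration only (not a FileCheck directive): with a flag such as nnan,
; the scalar reduction call is expected to look roughly like
;   %muladd = tail call nnan float @llvm.fmuladd.f32(float %x, float %y, float %sum)
; (placeholder value names), and the checks below expect the same flag to be
; carried by the widened @llvm.fmuladd.nxv8f32 calls and by the fadd/reduce.fadd
; instructions.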
define float @fmuladd_strict_fmf(float* %a, float* %b, i64 %n) #0 {
-; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict_fmf(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fmuladd_strict_fmf(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP56:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP58:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP59:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP25]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 8
-; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP27]]
-; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP29]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 16
-; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP31]]
-; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP33]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = mul i32 [[TMP34]], 24
-; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP35]]
-; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP37]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP14]]
-; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP19]]
-; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP43]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = mul i32 [[TMP44]], 8
-; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP45]]
-; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP47]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP48:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP49:%.*]] = mul i32 [[TMP48]], 16
-; CHECK-UNORDERED-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP49]]
-; CHECK-UNORDERED-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP51]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP52:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-UNORDERED-NEXT: [[TMP53:%.*]] = mul i32 [[TMP52]], 24
-; CHECK-UNORDERED-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP53]]
-; CHECK-UNORDERED-NEXT: [[TMP55:%.*]] = bitcast float* [[TMP54]] to <vscale x 8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP55]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP56]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI]])
-; CHECK-UNORDERED-NEXT: [[TMP57]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[WIDE_LOAD8]], <vscale x 8 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED-NEXT: [[TMP58]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[WIDE_LOAD9]], <vscale x 8 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED-NEXT: [[TMP59]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[WIDE_LOAD10]], <vscale x 8 x float> [[VEC_PHI3]])
-; CHECK-UNORDERED-NEXT: [[TMP60:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-UNORDERED-NEXT: [[TMP61:%.*]] = mul i64 [[TMP60]], 32
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP61]]
-; CHECK-UNORDERED-NEXT: [[TMP62:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP62]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[TMP57]], [[TMP56]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd nnan <vscale x 8 x float> [[TMP58]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd nnan <vscale x 8 x float> [[TMP59]], [[BIN_RDX11]]
-; CHECK-UNORDERED-NEXT: [[TMP63:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX12]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP64:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP65:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP64]], float [[TMP65]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fmuladd_strict_fmf(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 32
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP63:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 24
-; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP25]], align 4
-; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 8
-; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP27]]
-; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP29]], align 4
-; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 16
-; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP31]]
-; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP33]], align 4
-; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = mul i32 [[TMP34]], 24
-; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP35]]
-; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP37]], align 4
-; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP14]]
-; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP19]]
-; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = bitcast float* [[TMP42]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP43]], align 4
-; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = mul i32 [[TMP44]], 8
-; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP45]]
-; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = bitcast float* [[TMP46]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP47]], align 4
-; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = mul i32 [[TMP48]], 16
-; CHECK-ORDERED-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP49]]
-; CHECK-ORDERED-NEXT: [[TMP51:%.*]] = bitcast float* [[TMP50]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP51]], align 4
-; CHECK-ORDERED-NEXT: [[TMP52:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-ORDERED-NEXT: [[TMP53:%.*]] = mul i32 [[TMP52]], 24
-; CHECK-ORDERED-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[TMP38]], i32 [[TMP53]]
-; CHECK-ORDERED-NEXT: [[TMP55:%.*]] = bitcast float* [[TMP54]] to <vscale x 8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP55]], align 4
-; CHECK-ORDERED-NEXT: [[TMP56:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED-NEXT: [[TMP57:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED-NEXT: [[TMP58:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED-NEXT: [[TMP59:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED-NEXT: [[TMP60:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[TMP56]])
-; CHECK-ORDERED-NEXT: [[TMP61:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP60]], <vscale x 8 x float> [[TMP57]])
-; CHECK-ORDERED-NEXT: [[TMP62:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP61]], <vscale x 8 x float> [[TMP58]])
-; CHECK-ORDERED-NEXT: [[TMP63]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP62]], <vscale x 8 x float> [[TMP59]])
-; CHECK-ORDERED-NEXT: [[TMP64:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-ORDERED-NEXT: [[TMP65:%.*]] = mul i64 [[TMP64]], 32
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP65]]
-; CHECK-ORDERED-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP66]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP67:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP68:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[TMP67]], float [[TMP68]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP63]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fmuladd_strict_fmf
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ]
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
+; CHECK-ORDERED: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[RDX3]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_strict_fmf
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call nnan float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict_fmf
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S -o - < %s | FileCheck %s
; RUN: opt -mattr=+sve -loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -S -o - < %s | FileCheck %s
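; The checks below are minimal: value numbering is left to {{%.*}} patterns,
; and the only requirement is that each of the two interleaved lanes gets its
; own predicated store, i.e. a pred.store.if/pred.store.continue pair and a
; pred.store.if2/pred.store.continue3 pair inside vector.body.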
define void @foo(i32* %data1, i32* %data2) {
; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], -1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[DATA1:%.*]], i64 [[INDUCTION]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[DATA1]], i64 [[INDUCTION1]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP3]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: br i1 {{%.*}}, label %pred.store.if, label %pred.store.continue
; CHECK: pred.store.if:
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP0]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK-NEXT: store i32 {{%.*}}, i32* {{%.*}}
+; CHECK-NEXT: br label %pred.store.continue
; CHECK: pred.store.continue:
-; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
+; CHECK-NEXT: br i1 {{%.*}}, label %pred.store.if2, label %pred.store.continue3
; CHECK: pred.store.if2:
-; CHECK-NEXT: store i32 [[TMP3]], i32* [[TMP1]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE3]]
+; CHECK-NEXT: store i32 {{%.*}}, i32* {{%.*}}
+; CHECK-NEXT: br label %pred.store.continue3
; CHECK: pred.store.continue3:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, [[MIDDLE_BLOCK]] ], [ 1023, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[I_NEXT:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DATA1]], i64 [[I]]
-; CHECK-NEXT: [[LD:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[LD]], [[LD]]
-; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]]
-; CHECK: if.then:
-; CHECK-NEXT: store i32 [[LD]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[I_NEXT]] = add nsw i64 [[I]], -1
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i64 [[I]], 0
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: while.end:
-; CHECK-NEXT: ret void
-;
entry:
br label %while.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
target triple = "arm64-apple-ios5.0.0"
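; The checks for this function look at the -debug-only=loop-vectorize cost-model
; output rather than at the full vectorized IR: each scalar select feeding
; %cond, %cond6 and %cond11 is expected to be costed at 1 for both VF 2 and
; VF 4, and the IR check only requires a widened select <4 x i1> in vector.body.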
define void @selects_1(i32* nocapture %dst, i32 %A, i32 %B, i32 %C, i32 %N) {
-; CHECK-LABEL: @selects_1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP26]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[B:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[B]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i32> poison, i32 [[C]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT10]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = and <4 x i32> [[WIDE_LOAD]], <i32 2047, i32 2047, i32 2047, i32 2047>
-; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[WIDE_LOAD1]], <i32 2047, i32 2047, i32 2047, i32 2047>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i32> [[TMP9]], [[BROADCAST_SPLAT3]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP13:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> <i32 10, i32 10, i32 10, i32 10>, <4 x i32> [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <4 x i32> [[TMP8]], [[BROADCAST_SPLAT5]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq <4 x i32> [[TMP9]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT: [[TMP16:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> <i32 30, i32 30, i32 30, i32 30>, <4 x i32> [[TMP8]]
-; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP15]], <4 x i32> <i32 30, i32 30, i32 30, i32 30>, <4 x i32> [[TMP9]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt <4 x i32> [[TMP12]], [[BROADCAST_SPLAT9]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt <4 x i32> [[TMP13]], [[BROADCAST_SPLAT11]]
-; CHECK-NEXT: [[TMP20:%.*]] = select <4 x i1> [[TMP18]], <4 x i32> [[TMP12]], <4 x i32> [[TMP16]]
-; CHECK-NEXT: [[TMP21:%.*]] = select <4 x i1> [[TMP19]], <4 x i32> [[TMP13]], <4 x i32> [[TMP17]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP22]], align 4
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP21]], <4 x i32>* [[TMP23]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP25]], 2047
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[AND]], [[A]]
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP1]], i32 10, i32 [[AND]]
-; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[AND]], [[B]]
-; CHECK-NEXT: [[COND6:%.*]] = select i1 [[CMP2]], i32 30, i32 [[AND]]
-; CHECK-NEXT: [[CMP7:%.*]] = icmp ugt i32 [[COND]], [[C]]
-; CHECK-NEXT: [[COND11:%.*]] = select i1 [[CMP7]], i32 [[COND]], i32 [[COND6]]
-; CHECK-NEXT: store i32 [[COND11]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-;
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond = select i1 %cmp1, i32 10, i32 %and
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond6 = select i1 %cmp2, i32 30, i32 %and
+; CHECK: LV: Found an estimated cost of 1 for VF 2 For instruction: %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond = select i1 %cmp1, i32 10, i32 %and
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond6 = select i1 %cmp2, i32 30, i32 %and
+; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %cond11 = select i1 %cmp7, i32 %cond, i32 %cond6
+; CHECK-LABEL: define void @selects_1(
+; CHECK: vector.body:
+; CHECK: select <4 x i1>
entry:
%cmp26 = icmp sgt i32 %N, 0
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt -loop-vectorize -force-ordered-reductions=true -force-vector-width=1 -S < %s -debug 2> %t.debug | FileCheck %s
; RUN: cat %t.debug | FileCheck %s --check-prefix=CHECK-DEBUG
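; The second RUN line captures the -debug output in %t.debug and checks it with
; the CHECK-DEBUG prefix. For @foo below, which is a nested loop with an fadd
; reduction in the inner loop, the IR check only requires that no vector.body
; is created.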
define void @foo(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %M, i64 %N) {
; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY_US:%.*]]
-; CHECK: for.body.us:
-; CHECK-NEXT: [[I_023_US:%.*]] = phi i64 [ [[INC8_US:%.*]], [[FOR_COND3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[I_023_US]]
-; CHECK-NEXT: [[MUL_US:%.*]] = mul nsw i64 [[I_023_US]], [[N:%.*]]
-; CHECK-NEXT: br label [[FOR_BODY3_US:%.*]]
-; CHECK: for.body3.us:
-; CHECK-NEXT: [[TMP0:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_US]] ], [ [[ADD6_US:%.*]], [[FOR_BODY3_US]] ]
-; CHECK-NEXT: [[J_021_US:%.*]] = phi i64 [ 0, [[FOR_BODY_US]] ], [ [[INC_US:%.*]], [[FOR_BODY3_US]] ]
-; CHECK-NEXT: [[ADD_US:%.*]] = add nsw i64 [[J_021_US]], [[MUL_US]]
-; CHECK-NEXT: [[ARRAYIDX4_US:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[ADD_US]]
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX4_US]], align 4
-; CHECK-NEXT: [[ADD6_US]] = fadd float [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[INC_US]] = add nuw nsw i64 [[J_021_US]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC_US]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND3]], label [[FOR_BODY3_US]]
-; CHECK: for.cond3:
-; CHECK-NEXT: [[ADD6_US_LCSSA:%.*]] = phi float [ [[ADD6_US]], [[FOR_BODY3_US]] ]
-; CHECK-NEXT: store float [[ADD6_US_LCSSA]], float* [[ARRAYIDX_US]], align 4
-; CHECK-NEXT: [[INC8_US]] = add nuw nsw i64 [[I_023_US]], 1
-; CHECK-NEXT: [[EXITCOND26_NOT:%.*]] = icmp eq i64 [[INC8_US]], [[M:%.*]]
-; CHECK-NEXT: br i1 [[EXITCOND26_NOT]], label [[EXIT:%.*]], label [[FOR_BODY_US]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
+; CHECK-NOT: vector.body
entry:
br label %for.body.us
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -force-ordered-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED
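; The check prefixes map onto the RUN configurations above: CHECK-ORDERED
; expects the strict (in-order) form, with a scalar llvm.vector.reduce.fadd
; call chained through the vector loop; CHECK-UNORDERED expects a wide fadd in
; the loop and a single reduction only in middle.block; CHECK-NOT-VECTORIZED
; expects no vector.body at all. The last RUN line omits
; -force-ordered-reductions and is still checked with CHECK-ORDERED, i.e. the
; ordered form is expected by default when -hints-allow-reordering=false.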
define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP4]] = fadd <8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP4]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP7]], [[SUM_07]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_strict(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-ORDERED-NEXT: [[TMP4]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP6]], [[SUM_07]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_strict
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI]], <8 x float> %[[LOAD]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: ret float %[[PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_strict
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[FADD_VEC:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[LOAD_VEC:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[FADD_VEC]] = fadd <8 x float> %[[LOAD_VEC]], %[[VEC_PHI]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[FADD_VEC]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}}
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Same as above, but the fadd has a fast-math flag.
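; For illustration only (not a FileCheck directive): the scalar reduction here
; is expected to look roughly like
;   %add = fadd nnan float %val, %sum
; (placeholder value names), with the nnan flag preserved on the widened fadd
; and on the llvm.vector.reduce.fadd calls in the checks below.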
define float @fadd_strict_fmf(float* noalias nocapture readonly %a, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_fmf(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd nnan float [[TMP0]], [[SUM_07]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict_fmf(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP4]] = fadd nnan <8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP4]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD]] = fadd nnan float [[TMP7]], [[SUM_07]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_strict_fmf(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-ORDERED-NEXT: [[TMP4]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD]] = fadd nnan float [[TMP6]], [[SUM_07]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]]
-;
-
-
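+; With only 'nnan' on the fadd (no reassociation), the ordered run must keep the
+; strict in-loop reduction: a scalar float phi fed by llvm.vector.reduce.fadd on
+; every vector iteration. The unordered run may instead accumulate into a vector
+; phi with a plain vector fadd and reduce only once in middle.block.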
+; CHECK-ORDERED-LABEL: @fadd_strict_fmf
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX:%.*]], %vector.body ]
+; CHECK-ORDERED: [[LOAD_VEC:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[RDX]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[LOAD_VEC]])
+; CHECK-ORDERED: for.end:
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fadd_strict_fmf
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FADD_VEC:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[LOAD_VEC:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[FADD_VEC]] = fadd nnan <8 x float> [[LOAD_VEC]], [[VEC_PHI]]
+; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[FADD_VEC]])
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[FADD:%.*]] = fadd nnan float [[LOAD]], {{.*}}
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[FADD]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_fmf
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
}
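+; The same strict reduction, now vectorized with an interleave count of 4 (the
+; index advances by 32 per vector iteration). Roughly, inferred from the IR below
+; rather than quoted from the original source:
+;   float fadd_strict_unroll(float *a, long n) {
+;     float sum = 0.0;
+;     for (long i = 0; i < n; ++i)
+;       sum += a[i];
+;     return sum;
+;   }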
define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict_unroll(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 32
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP16]] = fadd <8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT: [[TMP17]] = fadd <8 x float> [[WIDE_LOAD4]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT: [[TMP18]] = fadd <8 x float> [[WIDE_LOAD5]], [[VEC_PHI2]]
-; CHECK-UNORDERED-NEXT: [[TMP19]] = fadd <8 x float> [[WIDE_LOAD6]], [[VEC_PHI3]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP17]], [[TMP16]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd <8 x float> [[TMP18]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd <8 x float> [[TMP19]], [[BIN_RDX7]]
-; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX8]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP22]], [[SUM_07]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_strict_unroll(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 32
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP16]], <8 x float> [[WIDE_LOAD1]])
-; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP17]], <8 x float> [[WIDE_LOAD2]])
-; CHECK-ORDERED-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP18]], <8 x float> [[WIDE_LOAD3]])
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP21]], [[SUM_07]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]]
-;
-
-
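+; In the ordered run the interleaved parts must not introduce extra floating-point
+; reduction phis: the -NOT line below checks that only the single scalar phi
+; exists, and the four reduce.fadd calls must be chained through one another.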
+; CHECK-ORDERED-LABEL: @fadd_strict_unroll
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
+; CHECK-ORDERED-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[LOAD3:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[LOAD4:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]])
+; CHECK-ORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]])
+; CHECK-ORDERED: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]])
+; CHECK-ORDERED: %[[RDX4]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX4]], %middle.block ]
+; CHECK-ORDERED: ret float %[[PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_strict_unroll
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD3:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD4:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD3:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD4:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <8 x float> %[[VEC_LOAD1]], %[[VEC_PHI1]]
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_LOAD2]], %[[VEC_PHI2]]
+; CHECK-UNORDERED: %[[VEC_FADD3]] = fadd <8 x float> %[[VEC_LOAD3]], %[[VEC_PHI3]]
+; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <8 x float> %[[VEC_LOAD4]], %[[VEC_PHI4]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]]
+; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]]
+; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]]
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[BIN_RDX3]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD:.*]] = fadd float %[[LOAD]], {{.*}}
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RES:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
}
; As @fadd_strict_unroll, but the reduction result is also needed after the loop.
; Roughly, inferred from the IR below rather than quoted from the original source:
;   float fadd_strict_unroll_last_val(float *a, float *b, long n) {
;     float sum = 0.0;
;     for (long i = 0; i < n; ++i)
;       sum += a[i];
;     if (n > 0)
;       *b = sum + 42.0;
;     return sum;
;   }
define float @fadd_strict_unroll_last_val(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll_last_val(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: [[CMP:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-NOT-VECTORIZED: for.body.preheader:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[FADD]] = fadd float [[SUM]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED: for.cond.cleanup:
-; CHECK-NOT-VECTORIZED-NEXT: [[FADD_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[FADD2:%.*]] = fadd float [[FADD_LCSSA]], 4.200000e+01
-; CHECK-NOT-VECTORIZED-NEXT: store float [[FADD2]], float* [[B:%.*]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_END]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_LCSSA:%.*]] = phi float [ [[FADD_LCSSA]], [[FOR_COND_CLEANUP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[SUM_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict_unroll_last_val(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[CMP:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-UNORDERED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-UNORDERED: for.body.preheader:
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP16]] = fadd <8 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-UNORDERED-NEXT: [[TMP17]] = fadd <8 x float> [[VEC_PHI1]], [[WIDE_LOAD4]]
-; CHECK-UNORDERED-NEXT: [[TMP18]] = fadd <8 x float> [[VEC_PHI2]], [[WIDE_LOAD5]]
-; CHECK-UNORDERED-NEXT: [[TMP19]] = fadd <8 x float> [[VEC_PHI3]], [[WIDE_LOAD6]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP17]], [[TMP16]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd <8 x float> [[TMP18]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd <8 x float> [[TMP19]], [[BIN_RDX7]]
-; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX8]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT: [[SUM:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[FADD]] = fadd float [[SUM]], [[TMP22]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK-UNORDERED: for.cond.cleanup:
-; CHECK-UNORDERED-NEXT: [[FADD_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ], [ [[TMP21]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: [[FADD2:%.*]] = fadd float [[FADD_LCSSA]], 4.200000e+01
-; CHECK-UNORDERED-NEXT: store float [[FADD2]], float* [[B:%.*]], align 4
-; CHECK-UNORDERED-NEXT: br label [[FOR_END]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[SUM_LCSSA:%.*]] = phi float [ [[FADD_LCSSA]], [[FOR_COND_CLEANUP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: ret float [[SUM_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_strict_unroll_last_val(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[CMP:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-ORDERED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-ORDERED: for.body.preheader:
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 32
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP16]], <8 x float> [[WIDE_LOAD1]])
-; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP17]], <8 x float> [[WIDE_LOAD2]])
-; CHECK-ORDERED-NEXT: [[TMP19]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP18]], <8 x float> [[WIDE_LOAD3]])
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT: [[SUM:%.*]] = phi float [ [[FADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[FADD]] = fadd float [[SUM]], [[TMP21]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK-ORDERED: for.cond.cleanup:
-; CHECK-ORDERED-NEXT: [[FADD_LCSSA:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ], [ [[TMP19]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: [[FADD2:%.*]] = fadd float [[FADD_LCSSA]], 4.200000e+01
-; CHECK-ORDERED-NEXT: store float [[FADD2]], float* [[B:%.*]], align 4
-; CHECK-ORDERED-NEXT: br label [[FOR_END]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[SUM_LCSSA:%.*]] = phi float [ [[FADD_LCSSA]], [[FOR_COND_CLEANUP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: ret float [[SUM_LCSSA]]
-;
-
-
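+; Here the reduced value is also used after the loop (42.0 is added and the result
+; stored to %b), so the checks follow it through for.cond.cleanup as well as the
+; final return.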
+; CHECK-ORDERED-LABEL: @fadd_strict_unroll_last_val
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4:.*]], %vector.body ]
+; CHECK-ORDERED-NOT: phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX4]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[LOAD3:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[LOAD4:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[LOAD1]])
+; CHECK-ORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[LOAD2]])
+; CHECK-ORDERED: %[[RDX3:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX2]], <8 x float> %[[LOAD3]])
+; CHECK-ORDERED: %[[RDX4]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX3]], <8 x float> %[[LOAD4]])
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: %[[SUM_PHI:.*]] = phi float [ %[[FADD:.*]], %for.body ], [ {{.*}}, %scalar.ph ]
+; CHECK-ORDERED: %[[LOAD5:.*]] = load float, float*
+; CHECK-ORDERED: %[[FADD]] = fadd float %[[SUM_PHI]], %[[LOAD5]]
+; CHECK-ORDERED: for.cond.cleanup
+; CHECK-ORDERED: %[[FADD_LCSSA:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX4]], %middle.block ]
+; CHECK-ORDERED: %[[FADD_42:.*]] = fadd float %[[FADD_LCSSA]], 4.200000e+01
+; CHECK-ORDERED: store float %[[FADD_42]], float* %b
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[SUM_LCSSA:.*]] = phi float [ %[[FADD_LCSSA]], %for.cond.cleanup ], [ 0.000000e+00, %entry ]
+; CHECK-ORDERED: ret float %[[SUM_LCSSA]]
+
+; CHECK-UNORDERED-LABEL: @fadd_strict_unroll_last_val
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD3:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD4:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD3:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD4:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <8 x float> %[[VEC_PHI1]], %[[VEC_LOAD1]]
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_PHI2]], %[[VEC_LOAD2]]
+; CHECK-UNORDERED: %[[VEC_FADD3]] = fadd <8 x float> %[[VEC_PHI3]], %[[VEC_LOAD3]]
+; CHECK-UNORDERED: %[[VEC_FADD4]] = fadd <8 x float> %[[VEC_PHI4]], %[[VEC_LOAD4]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd <8 x float> %[[VEC_FADD2]], %[[VEC_FADD1]]
+; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd <8 x float> %[[VEC_FADD3]], %[[BIN_RDX1]]
+; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd <8 x float> %[[VEC_FADD4]], %[[BIN_RDX2]]
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[BIN_RDX3]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD:.*]] = fadd float {{.*}}, %[[LOAD]]
+; CHECK-UNORDERED: for.cond.cleanup
+; CHECK-UNORDERED: %[[FADD_LCSSA:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: %[[FADD_42:.*]] = fadd float %[[FADD_LCSSA]], 4.200000e+01
+; CHECK-UNORDERED: store float %[[FADD_42]], float* %b
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[SUM_LCSSA:.*]] = phi float [ %[[FADD_LCSSA]], %for.cond.cleanup ], [ 0.000000e+00, %entry ]
+; CHECK-UNORDERED: ret float %[[SUM_LCSSA]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_unroll_last_val
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
%cmp = icmp sgt i64 %n, 0
}
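+; Two interleaved strict reductions in one loop. Roughly, inferred from the scalar
+; IR rather than quoted from the original source:
+;   void fadd_strict_interleave(float *a, float *b, long n) {
+;     float sum0 = a[0], sum1 = a[1];
+;     for (long i = 0; i < n; i += 2) {
+;       sum0 += b[i];
+;       sum1 += b[i + 1];
+;     }
+;     a[0] = sum0;
+;     a[1] = sum1;
+;   }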
define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_interleave(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-NOT-VECTORIZED-NEXT: [[A1:%.*]] = load float, float* [[A]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[A2]], [[ENTRY:%.*]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDXB1]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD1]] = fadd float [[TMP0]], [[ADD_PHI2]]
-; CHECK-NOT-VECTORIZED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDXB2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD2]] = fadd float [[TMP1]], [[ADD_PHI1]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: ret void
-;
-; CHECK-UNORDERED-LABEL: @fadd_strict_interleave(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-UNORDERED-NEXT: [[A1:%.*]] = load float, float* [[A]], align 4
-; CHECK-UNORDERED-NEXT: [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], -2
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = insertelement <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, float [[A2]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = insertelement <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, float [[A1]], i32 0
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ [[TMP4]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[TMP8]], align 4
-; CHECK-UNORDERED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-UNORDERED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-UNORDERED-NEXT: [[TMP9]] = fadd <4 x float> [[STRIDED_VEC]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT: [[TMP10]] = fadd <4 x float> [[STRIDED_VEC2]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP9]])
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP10]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDXB1]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD1]] = fadd float [[TMP14]], [[ADD_PHI2]]
-; CHECK-UNORDERED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]]
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDXB2]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD2]] = fadd float [[TMP15]], [[ADD_PHI1]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4
-; CHECK-UNORDERED-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
-; CHECK-UNORDERED-NEXT: ret void
-;
-; CHECK-ORDERED-LABEL: @fadd_strict_interleave(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-ORDERED-NEXT: [[A1:%.*]] = load float, float* [[A]], align 4
-; CHECK-ORDERED-NEXT: [[A2:%.*]] = load float, float* [[ARRAYIDXA]], align 4
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], -2
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add nuw i64 [[TMP1]], 1
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ [[A2]], [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ [[A1]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 2
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP3]]
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_VEC:%.*]] = load <8 x float>, <8 x float>* [[TMP6]], align 4
-; CHECK-ORDERED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; CHECK-ORDERED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x float> [[WIDE_VEC]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-ORDERED-NEXT: [[TMP7]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI1]], <4 x float> [[STRIDED_VEC]])
-; CHECK-ORDERED-NEXT: [[TMP8]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[STRIDED_VEC2]])
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[A2]], [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX3:%.*]] = phi float [ [[A1]], [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[ADD_PHI1:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ADD_PHI2:%.*]] = phi float [ [[BC_MERGE_RDX3]], [[SCALAR_PH]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDXB1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDXB1]], align 4
-; CHECK-ORDERED-NEXT: [[ADD1]] = fadd float [[TMP10]], [[ADD_PHI2]]
-; CHECK-ORDERED-NEXT: [[OR:%.*]] = or i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[ARRAYIDXB2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[OR]]
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDXB2]], align 4
-; CHECK-ORDERED-NEXT: [[ADD2]] = fadd float [[TMP11]], [[ADD_PHI1]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 2
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD1_LCSSA:%.*]] = phi float [ [[ADD1]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: [[ADD2_LCSSA:%.*]] = phi float [ [[ADD2]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: store float [[ADD1_LCSSA]], float* [[A]], align 4
-; CHECK-ORDERED-NEXT: store float [[ADD2_LCSSA]], float* [[ARRAYIDXA]], align 4
-; CHECK-ORDERED-NEXT: ret void
-;
-
-
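+; Each of the two reductions keeps its own scalar phi in the ordered run, fed by an
+; in-order reduce.fadd over one lane set of the strided shufflevector.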
+; CHECK-ORDERED-LABEL: @fadd_strict_interleave
+; CHECK-ORDERED: entry
+; CHECK-ORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1
+; CHECK-ORDERED: %[[LOAD1:.*]] = load float, float* %a
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]]
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ]
+; CHECK-ORDERED: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ]
+; CHECK-ORDERED: %[[WIDE_LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-ORDERED: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-ORDERED: %[[RDX1]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI2]], <4 x float> %[[STRIDED1]])
+; CHECK-ORDERED: %[[RDX2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[STRIDED2]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: ret void
+
+; CHECK-UNORDERED-LABEL: @fadd_strict_interleave
+; CHECK-UNORDERED: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1
+; CHECK-UNORDERED: %[[LOADA1:.*]] = load float, float* %a
+; CHECK-UNORDERED: %[[LOADA2:.*]] = load float, float* %[[ARRAYIDX]]
+; CHECK-UNORDERED: vector.ph
+; CHECK-UNORDERED: %[[INS2:.*]] = insertelement <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, float %[[LOADA2]], i32 0
+; CHECK-UNORDERED: %[[INS1:.*]] = insertelement <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, float %[[LOADA1]], i32 0
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi <4 x float> [ %[[INS2]], %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi <4 x float> [ %[[INS1]], %vector.ph ], [ %[[VEC_FADD1:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[WIDE_LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-UNORDERED: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-UNORDERED: %[[VEC_FADD1]] = fadd <4 x float> %[[STRIDED1]], %[[VEC_PHI1]]
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <4 x float> %[[STRIDED2]], %[[VEC_PHI2]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD1]])
+; CHECK-UNORDERED: %[[RDX2:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], {{.*}}
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float %[[LOAD2]], {{.*}}
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[SUM1:.*]] = phi float [ %[[FADD1]], %for.body ], [ %[[RDX1]], %middle.block ]
+; CHECK-UNORDERED: %[[SUM2:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX2]], %middle.block ]
+; CHECK-UNORDERED: store float %[[SUM1]]
+; CHECK-UNORDERED: store float %[[SUM2]]
+; CHECK-UNORDERED: ret void
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_strict_interleave
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
%arrayidxa = getelementptr inbounds float, float* %a, i64 1
}
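+; A reduction over the sum of two loads, guarded by a check on a[1]. Roughly,
+; inferred from the scalar IR rather than quoted from the original source:
+;   float fadd_of_sum(float *a, float *b, long n) {
+;     if (a[1] > 0.5f) {
+;       float res = 0.0;
+;       for (long i = 0; i < n; ++i)
+;         res += a[i] + b[i];
+;       return res;
+;     }
+;     return 0.0;
+;   }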
define float @fadd_of_sum(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_of_sum(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-NOT-VECTORIZED: for.body.preheader:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD:%.*]] = fadd float [[TMP1]], [[TMP2]]
-; CHECK-NOT-VECTORIZED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]]
-; CHECK-NOT-VECTORIZED: for.end.loopexit:
-; CHECK-NOT-VECTORIZED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_END]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[RES]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_of_sum(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01
-; CHECK-UNORDERED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-UNORDERED: for.body.preheader:
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-UNORDERED-NEXT: [[TMP9]] = fadd <4 x float> [[VEC_PHI]], [[TMP8]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP9]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[TMP12]], [[TMP13]]
-; CHECK-UNORDERED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-UNORDERED: for.end.loopexit:
-; CHECK-UNORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_END]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-UNORDERED-NEXT: ret float [[RES]]
-;
-; CHECK-ORDERED-LABEL: @fadd_of_sum(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP0]], 5.000000e-01
-; CHECK-ORDERED-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK-ORDERED: for.body.preheader:
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-ORDERED-NEXT: [[TMP9]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[TMP8]])
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT: [[RES_014:%.*]] = phi float [ [[RDX:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[TMP11]], [[TMP12]]
-; CHECK-ORDERED-NEXT: [[RDX]] = fadd float [[RES_014]], [[ADD]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-ORDERED: for.end.loopexit:
-; CHECK-ORDERED-NEXT: [[RDX_LCSSA:%.*]] = phi float [ [[RDX]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_END]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RDX_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-ORDERED-NEXT: ret float [[RES]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_of_sum
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK-ORDERED: %[[LOAD2:.*]] = load <4 x float>, <4 x float>*
+; CHECK-ORDERED: %[[ADD:.*]] = fadd <4 x float> %[[LOAD1]], %[[LOAD2]]
+; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[ADD]])
+; CHECK-ORDERED: for.end.loopexit
+; CHECK-ORDERED: %[[EXIT_PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ]
+; CHECK-ORDERED: ret float %[[PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_of_sum
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI:.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <4 x float>, <4 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <4 x float> %[[VEC_LOAD1]], %[[VEC_LOAD2]]
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <4 x float> %[[VEC_PHI]], %[[VEC_FADD1]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %[[LOAD1]], %[[LOAD2]]
+; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float {{.*}}, %[[FADD1]]
+; CHECK-UNORDERED: for.end.loopexit
+; CHECK-UNORDERED: %[[EXIT:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT]], %for.end.loopexit ]
+; CHECK-UNORDERED: ret float %[[SUM]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_of_sum
+; CHECK-NOT-VECTORIZED-NOT: vector.body
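; Summary of the checks above: under the ordered (strict) lowering the vector
; loop carries a scalar float phi and calls llvm.vector.reduce.fadd.v4f32 on
; every iteration, preserving the original fadd order; under the unordered
; lowering it carries a <4 x float> accumulator instead, and a single reduce
; call with a -0.000000e+00 start value appears only in middle.block.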
entry:
%arrayidx = getelementptr inbounds float, float* %a, i64 1
}
define float @fadd_conditional(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_conditional(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[RES:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP0]], 0.000000e+00
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-NOT-VECTORIZED: if.then:
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_INC]]
-; CHECK-NOT-VECTORIZED: for.inc:
-; CHECK-NOT-VECTORIZED-NEXT: [[PHI:%.*]] = phi float [ [[TMP1]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[RDX]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_conditional(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 1.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = fcmp une <4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-UNORDERED-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK-UNORDERED: pred.load.if:
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 0
-; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK-UNORDERED: pred.load.continue:
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; CHECK-UNORDERED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK-UNORDERED: pred.load.if1:
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP13]], i32 1
-; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE2]]
-; CHECK-UNORDERED: pred.load.continue2:
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = phi <4 x float> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ]
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; CHECK-UNORDERED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK-UNORDERED: pred.load.if3:
-; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2
-; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
-; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP19]], i32 2
-; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE4]]
-; CHECK-UNORDERED: pred.load.continue4:
-; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = phi <4 x float> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP20]], [[PRED_LOAD_IF3]] ]
-; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; CHECK-UNORDERED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
-; CHECK-UNORDERED: pred.load.if5:
-; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 3
-; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
-; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP25]], i32 3
-; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE6]]
-; CHECK-UNORDERED: pred.load.continue6:
-; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = phi <4 x float> [ [[TMP21]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP26]], [[PRED_LOAD_IF5]] ]
-; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-UNORDERED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP28]], <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>, <4 x float> [[TMP27]]
-; CHECK-UNORDERED-NEXT: [[TMP29]] = fadd <4 x float> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP29]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP32]], 0.000000e+00
-; CHECK-UNORDERED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-UNORDERED: if.then:
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: br label [[FOR_INC]]
-; CHECK-UNORDERED: for.inc:
-; CHECK-UNORDERED-NEXT: [[PHI:%.*]] = phi float [ [[TMP33]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP31]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[RDX]]
-;
-; CHECK-ORDERED-LABEL: @fadd_conditional(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 1.000000e+00, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = fcmp une <4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-ORDERED-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK-ORDERED: pred.load.if:
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[TMP6]], align 4
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP7]], i32 0
-; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK-ORDERED: pred.load.continue:
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP4]], i32 1
-; CHECK-ORDERED-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK-ORDERED: pred.load.if1:
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP9]], float [[TMP13]], i32 1
-; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE2]]
-; CHECK-ORDERED: pred.load.continue2:
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = phi <4 x float> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ]
-; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP4]], i32 2
-; CHECK-ORDERED-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK-ORDERED: pred.load.if3:
-; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 2
-; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
-; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = load float, float* [[TMP18]], align 4
-; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP19]], i32 2
-; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE4]]
-; CHECK-ORDERED: pred.load.continue4:
-; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = phi <4 x float> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP20]], [[PRED_LOAD_IF3]] ]
-; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP4]], i32 3
-; CHECK-ORDERED-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
-; CHECK-ORDERED: pred.load.if5:
-; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 3
-; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
-; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = load float, float* [[TMP24]], align 4
-; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP25]], i32 3
-; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE6]]
-; CHECK-ORDERED: pred.load.continue6:
-; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = phi <4 x float> [ [[TMP21]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP26]], [[PRED_LOAD_IF5]] ]
-; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-ORDERED-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP28]], <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>, <4 x float> [[TMP27]]
-; CHECK-ORDERED-NEXT: [[TMP29]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[PREDPHI]])
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 1.000000e+00, [[ENTRY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[FADD:%.*]], [[FOR_INC]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[TOBOOL:%.*]] = fcmp une float [[TMP31]], 0.000000e+00
-; CHECK-ORDERED-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-ORDERED: if.then:
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: br label [[FOR_INC]]
-; CHECK-ORDERED: for.inc:
-; CHECK-ORDERED-NEXT: [[PHI:%.*]] = phi float [ [[TMP32]], [[IF_THEN]] ], [ 3.000000e+00, [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[FADD]] = fadd float [[RES]], [[PHI]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[FADD]], [[FOR_INC]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[RDX]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_conditional
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: %[[PHI:.*]] = phi float [ 1.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue6 ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK-ORDERED: %[[FCMP1:.*]] = fcmp une <4 x float> %[[LOAD1]], zeroinitializer
+; CHECK-ORDERED: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0
+; CHECK-ORDERED: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue
+; CHECK-ORDERED: pred.load.continue6
+; CHECK-ORDERED: %[[PHI1:.*]] = phi <4 x float> [ %[[PHI0:.*]], %pred.load.continue4 ], [ %[[INS_ELT:.*]], %pred.load.if5 ]
+; CHECK-ORDERED: %[[XOR:.*]] = xor <4 x i1> %[[FCMP1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-ORDERED: %[[PRED:.*]] = select <4 x i1> %[[XOR]], <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>, <4 x float> %[[PHI1]]
+; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[PHI]], <4 x float> %[[PRED]])
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-ORDERED: %[[FCMP2:.*]] = fcmp une float %[[LOAD2]], 0.000000e+00
+; CHECK-ORDERED: br i1 %[[FCMP2]], label %if.then, label %for.inc
+; CHECK-ORDERED: if.then
+; CHECK-ORDERED: %[[LOAD3:.*]] = load float, float*
+; CHECK-ORDERED: br label %for.inc
+; CHECK-ORDERED: for.inc
+; CHECK-ORDERED: %[[PHI2:.*]] = phi float [ %[[LOAD3]], %if.then ], [ 3.000000e+00, %for.body ]
+; CHECK-ORDERED: %[[FADD]] = fadd float %[[RES_PHI]], %[[PHI2]]
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: ret float %[[RDX_PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_conditional
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[PHI:.*]] = phi <4 x float> [ <float 1.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD:.*]], %pred.load.continue6 ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK-UNORDERED: %[[FCMP1:.*]] = fcmp une <4 x float> %[[LOAD1]], zeroinitializer
+; CHECK-UNORDERED: %[[EXTRACT:.*]] = extractelement <4 x i1> %[[FCMP1]], i32 0
+; CHECK-UNORDERED: br i1 %[[EXTRACT]], label %pred.load.if, label %pred.load.continue
+; CHECK-UNORDERED: pred.load.continue6
+; CHECK-UNORDERED: %[[XOR:.*]] = xor <4 x i1> %[[FCMP1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-UNORDERED: %[[PRED:.*]] = select <4 x i1> %[[XOR]], <4 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>, <4 x float> %[[PRED_PHI:.*]]
+; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[PHI]], %[[PRED]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[MERGE_RDX:.*]], %scalar.ph ], [ %[[FADD:.*]], %for.inc ]
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FCMP2:.*]] = fcmp une float %[[LOAD2]], 0.000000e+00
+; CHECK-UNORDERED: br i1 %[[FCMP2]], label %if.then, label %for.inc
+; CHECK-UNORDERED: if.then
+; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, float*
+; CHECK-UNORDERED: for.inc
+; CHECK-UNORDERED: %[[PHI:.*]] = phi float [ %[[LOAD3]], %if.then ], [ 3.000000e+00, %for.body ]
+; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RES_PHI]], %[[PHI]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi float [ %[[FADD]], %for.inc ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RDX_PHI]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_conditional
+; CHECK-NOT-VECTORIZED-NOT: vector.body
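; Summary of the checks above: the conditional load becomes predicated
; pred.load.if/pred.load.continue blocks, lanes that fail the fcmp receive the
; constant 3.0 through a select, and the selected vector feeds either an
; in-loop llvm.vector.reduce.fadd.v4f32 call (ordered) or a <4 x float>
; accumulator that is reduced once in middle.block (unordered).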
entry:
br label %for.body
; Test to check that masking is correct when using the "llvm.loop.vectorize.predicate.enable" attribute
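; A minimal sketch (illustrative only, not part of this test) of how that
; attribute is typically attached, via loop metadata on the latch branch:
;   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
;   ...
;   !0 = distinct !{!0, !1}
;   !1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}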
define float @fadd_predicated(float* noalias nocapture %a, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_predicated(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_02:%.*]] = phi float [ [[L7:%.*]], [[FOR_BODY]] ], [ 0.000000e+00, [[ENTRY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[L2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[L3:%.*]] = load float, float* [[L2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[L7]] = fadd float [[SUM_02]], [[L3]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_0_LCSSA:%.*]] = phi float [ [[L7]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[SUM_0_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_predicated(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 1
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; CHECK-UNORDERED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-UNORDERED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE2]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <2 x float> [ <float 0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[PRED_LOAD_CONTINUE2]] ]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
-; CHECK-UNORDERED-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK-UNORDERED: pred.load.if:
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP2]]
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0
-; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK-UNORDERED: pred.load.continue:
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_LOAD_IF]] ]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
-; CHECK-UNORDERED-NEXT: br i1 [[TMP7]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
-; CHECK-UNORDERED: pred.load.if1:
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP8]]
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = load float, float* [[TMP9]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP10]], i32 1
-; CHECK-UNORDERED-NEXT: br label [[PRED_LOAD_CONTINUE2]]
-; CHECK-UNORDERED: pred.load.continue2:
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = phi <2 x float> [ [[TMP6]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], [[PRED_LOAD_IF1]] ]
-; CHECK-UNORDERED-NEXT: [[TMP13]] = fadd <2 x float> [[VEC_PHI]], [[TMP12]]
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP0]], <2 x float> [[TMP13]], <2 x float> [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; CHECK-UNORDERED-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> [[TMP14]])
-; CHECK-UNORDERED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_02:%.*]] = phi float [ [[L7:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-UNORDERED-NEXT: [[L2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[L3:%.*]] = load float, float* [[L2]], align 4
-; CHECK-UNORDERED-NEXT: [[L7]] = fadd float [[SUM_02]], [[L3]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[SUM_0_LCSSA:%.*]] = phi float [ [[L7]], [[FOR_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[SUM_0_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_predicated(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 1
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[N]], 1
-; CHECK-ORDERED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-ORDERED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-ORDERED-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE2]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[PRED_LOAD_CONTINUE2]] ]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[TMP0]], i32 0
-; CHECK-ORDERED-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK-ORDERED: pred.load.if:
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP2]]
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0
-; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK-ORDERED: pred.load.continue:
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = phi <2 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_LOAD_IF]] ]
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP0]], i32 1
-; CHECK-ORDERED-NEXT: br i1 [[TMP7]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
-; CHECK-ORDERED: pred.load.if1:
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 1
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP8]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, float* [[TMP9]], align 4
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP10]], i32 1
-; CHECK-ORDERED-NEXT: br label [[PRED_LOAD_CONTINUE2]]
-; CHECK-ORDERED: pred.load.continue2:
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = phi <2 x float> [ [[TMP6]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP11]], [[PRED_LOAD_IF1]] ]
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP0]], <2 x float> [[TMP12]], <2 x float> <float -0.000000e+00, float -0.000000e+00>
-; CHECK-ORDERED-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.v2f32(float [[VEC_PHI]], <2 x float> [[TMP13]])
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; CHECK-ORDERED-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT: [[SUM_02:%.*]] = phi float [ [[L7:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-ORDERED-NEXT: [[L2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[L3:%.*]] = load float, float* [[L2]], align 4
-; CHECK-ORDERED-NEXT: [[L7]] = fadd float [[SUM_02]], [[L3]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[SUM_0_LCSSA:%.*]] = phi float [ [[L7]], [[FOR_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[SUM_0_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_predicated
+; CHECK-ORDERED: vector.ph
+; CHECK-ORDERED: %[[TRIP_MINUS_ONE:.*]] = sub i64 %n, 1
+; CHECK-ORDERED: %[[BROADCAST_INS:.*]] = insertelement <2 x i64> poison, i64 %[[TRIP_MINUS_ONE]], i32 0
+; CHECK-ORDERED: %[[SPLAT:.*]] = shufflevector <2 x i64> %[[BROADCAST_INS]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %pred.load.continue2 ]
+; CHECK-ORDERED: pred.load.continue2
+; CHECK-ORDERED: %[[PHI:.*]] = phi <2 x float> [ %[[PHI0:.*]], %pred.load.continue ], [ %[[INS_ELT:.*]], %pred.load.if1 ]
+; CHECK-ORDERED: %[[MASK:.*]] = select <2 x i1> %0, <2 x float> %[[PHI]], <2 x float> <float -0.000000e+00, float -0.000000e+00>
+; CHECK-ORDERED: %[[RDX]] = call float @llvm.vector.reduce.fadd.v2f32(float %[[RDX_PHI]], <2 x float> %[[MASK]])
+; CHECK-ORDERED: for.end:
+; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-ORDERED: ret float %[[RES_PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_predicated
+; CHECK-UNORDERED: vector.ph
+; CHECK-UNORDERED: %[[TRIP_MINUS_ONE:.*]] = sub i64 %n, 1
+; CHECK-UNORDERED: %[[BROADCAST_INS:.*]] = insertelement <2 x i64> poison, i64 %[[TRIP_MINUS_ONE]], i32 0
+; CHECK-UNORDERED: %[[SPLAT:.*]] = shufflevector <2 x i64> %[[BROADCAST_INS]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi <2 x float> [ <float 0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[FADD:.*]], %pred.load.continue2 ]
+; CHECK-UNORDERED: %[[ICMP:.*]] = icmp ule <2 x i64> %vec.ind, %[[SPLAT]]
+; CHECK-UNORDERED: pred.load.continue2
+; CHECK-UNORDERED: %[[FADD]] = fadd <2 x float> %[[RDX_PHI]], {{.*}}
+; CHECK-UNORDERED: %[[MASK:.*]] = select <2 x i1> %[[ICMP]], <2 x float> %[[FADD]], <2 x float> %[[RDX_PHI]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v2f32(float -0.000000e+00, <2 x float> %[[MASK]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[LOAD:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD2:.*]] = fadd float {{.*}}, %[[LOAD]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[SUM]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_predicated
+; CHECK-NOT-VECTORIZED-NOT: vector.body
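; Summary of the checks above: with tail folding the ordered lowering selects
; -0.0 (the fadd identity) into masked-off lanes before its in-loop
; llvm.vector.reduce.fadd.v2f32 call, whereas the unordered lowering selects
; between the new and the previous <2 x float> accumulator and performs a
; single reduction in middle.block.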
entry:
br label %for.body
; Negative test - loop contains multiple fadds which we cannot safely reorder
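; Roughly, the reduction chain below performs two dependent scalar fadds per
; iteration (value names here are illustrative):
;   %add  = fadd float %sum, %a.val   ; %a.val loaded from a[iv]
;   %add3 = fadd float %add, %b.val   ; %b.val loaded from b[iv]
; so no ordered (strict in-order) reduction is formed for it.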
define float @fadd_multiple(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP1]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[RDX]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_multiple(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = fadd <8 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP8]] = fadd <8 x float> [[TMP4]], [[WIDE_LOAD1]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP8]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP11]]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP12]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[RDX]]
-;
-; CHECK-ORDERED-LABEL: @fadd_multiple(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[ADD3]] = fadd float [[ADD]], [[TMP1]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: ret float [[RDX]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_multiple
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @fadd_multiple
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[PHI:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>
+; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]]
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]]
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[FADD1]], %[[LOAD2]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RET]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple
+; CHECK-NOT-VECTORIZED-NOT: vector.body
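; Summary of the checks above: the CHECK-ORDERED prefix expects no vector.body
; at all for this loop, while the CHECK-UNORDERED prefix still expects it to be
; vectorized with an <8 x float> accumulator that is reduced to a scalar once
; in middle.block.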
entry:
br label %for.body
; Negative test - the loop contains two fadds but only one of them carries the
; fast flag, so we cannot safely reorder the reduction.
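; Roughly, the two adds in the chain differ only in their fast-math flags
; (value names here are illustrative):
;   %add  = fadd float %sum, %a.val        ; strict, no reassociation allowed
;   %add3 = fadd fast float %add, %b.val   ; reassociation allowed on this one
; so the ordered lowering refuses to vectorize, and only the unordered run
; produces a vector.body (see the checks below).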
define float @fadd_multiple_one_flag(float* noalias nocapture %a, float* noalias nocapture %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_one_flag(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd fast float [[ADD]], [[TMP1]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP0]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[RDX]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_multiple_one_flag(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = fadd <8 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP8]] = fadd fast <8 x float> [[TMP4]], [[WIDE_LOAD1]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP8]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP11]]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd fast float [[ADD]], [[TMP12]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[RDX]]
-;
-; CHECK-ORDERED-LABEL: @fadd_multiple_one_flag(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM:%.*]] = phi float [ -0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD:%.*]] = fadd float [[SUM]], [[TMP0]]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[ADD3]] = fadd fast float [[ADD]], [[TMP1]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[RDX:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: ret float [[RDX]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_multiple_one_flag
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @fadd_multiple_one_flag
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[PHI:.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_LOAD1:.*]] = load <8 x float>, <8 x float>
+; CHECK-UNORDERED: %[[VEC_FADD1:.*]] = fadd <8 x float> %[[PHI]], %[[VEC_LOAD1]]
+; CHECK-UNORDERED: %[[VEC_LOAD2:.*]] = load <8 x float>, <8 x float>
+; CHECK-UNORDERED: %[[VEC_FADD2]] = fadd fast <8 x float> %[[VEC_FADD1]], %[[VEC_LOAD2]]
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[RDX:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> %[[VEC_FADD2]])
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[SUM:.*]] = phi float [ %bc.merge.rdx, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1:.*]] = fadd float %sum, %[[LOAD1]]
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD2]] = fadd fast float %[[FADD1]], %[[LOAD2]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RET:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RET]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_one_flag
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
; with the -hints-allow-reordering flag set to true.
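; For reference only, a sketch (not part of this test) of the kind of hint metadata the note
; above refers to, attached to the loop backedge; the metadata names are the standard
; vectorizer hints, but the operand values here are illustrative:
;   br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
;   !0 = distinct !{!0, !1}
;   !1 = !{!"llvm.loop.vectorize.enable", i1 true}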
define float @induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) {
-; CHECK-NOT-VECTORIZED-LABEL: @induction_and_reduction(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[X_014:%.*]] = phi float [ [[INIT:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: store float [[X_014]], float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[X_014]], 2.000000e+00
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD3_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @induction_and_reduction(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[X_014:%.*]] = phi float [ [[INIT:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: store float [[X_014]], float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[X_014]], 2.000000e+00
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: ret float [[ADD3_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @induction_and_reduction(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[X_014:%.*]] = phi float [ [[INIT:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: store float [[X_014]], float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[X_014]], 2.000000e+00
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP0]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: ret float [[ADD3_LCSSA]]
-;
+; CHECK-ORDERED-LABEL: @induction_and_reduction
+; CHECK-ORDERED-NOT: vector.body
+; CHECK-UNORDERED-LABEL: @induction_and_reduction
+; CHECK-UNORDERED-NOT: vector.body
+; CHECK-NOT-VECTORIZED-LABEL: @induction_and_reduction
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; As above, but with the FP induction being unordered (fast), the loop can be vectorized with strict reductions.
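; In the strict (in-order) case the checks below expect the vector body to carry a scalar
; accumulator and fold each widened load into it with an ordered reduction call, roughly
; (names illustrative; the exact values are in the CHECK-ORDERED lines):
;   %rdx = phi float [ 0.000000e+00, %vector.ph ], [ %rdx.next, %vector.body ]
;   %rdx.next = call float @llvm.vector.reduce.fadd.v4f32(float %rdx, <4 x float> %wide.load)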
define float @fast_induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) {
-; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_and_reduction(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_015:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[X_014:%.*]] = phi fast float [ [[INIT:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: store float [[X_014]], float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd fast float [[X_014]], 2.000000e+00
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD3_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fast_induction_and_reduction(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = fmul fast float 2.000000e+00, [[CAST_CRD]]
-; CHECK-UNORDERED-NEXT: [[IND_END:%.*]] = fadd fast float [[INIT:%.*]], [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i32 0
-; CHECK-UNORDERED-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-; CHECK-UNORDERED-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-UNORDERED-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP4]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP8]] = fadd <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP8]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[INIT]], [[ENTRY]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_015:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[X_014:%.*]] = phi fast float [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: store float [[X_014]], float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD]] = fadd fast float [[X_014]], 2.000000e+00
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP11]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[ADD3_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fast_induction_and_reduction(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = fmul fast float 2.000000e+00, [[CAST_CRD]]
-; CHECK-ORDERED-NEXT: [[IND_END:%.*]] = fadd fast float [[INIT:%.*]], [[TMP0]]
-; CHECK-ORDERED-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[INIT]], i32 0
-; CHECK-ORDERED-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
-; CHECK-ORDERED-NEXT: [[INDUCTION:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-ORDERED-NEXT: store <4 x float> [[VEC_IND]], <4 x float>* [[TMP4]], align 4
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-ORDERED-NEXT: [[TMP8]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-ORDERED-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[INIT]], [[ENTRY]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_015:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[X_014:%.*]] = phi fast float [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: store float [[X_014]], float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD]] = fadd fast float [[X_014]], 2.000000e+00
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[ADD3]] = fadd float [[SUM_015]], [[TMP10]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[ADD3_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fast_induction_and_reduction
+; CHECK-ORDERED: vector.ph
+; CHECK-ORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ]
+; CHECK-ORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK-ORDERED: %[[FADD1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[RDX_PHI]], <4 x float> %[[LOAD1]])
+; CHECK-ORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD2:.*]], %for.body ]
+; CHECK-ORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ]
+; CHECK-ORDERED: store float %[[IND_SUM_PHI]], float*
+; CHECK-ORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-ORDERED: %[[FADD2]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]]
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD2]], %for.body ], [ %[[FADD1]], %middle.block ]
+; CHECK-ORDERED: ret float %[[RES_PHI]]
+
+; CHECK-UNORDERED-LABEL: @fast_induction_and_reduction
+; CHECK-UNORDERED: vector.ph
+; CHECK-UNORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 2.000000e+00, float 4.000000e+00, float 6.000000e+00>
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ %[[VEC_FADD:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[RDX_PHI]], %[[LOAD1]]
+; CHECK-UNORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], <float 8.000000e+00, float 8.000000e+00, float 8.000000e+00, float 8.000000e+00>
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: %[[VEC_RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD]])
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD:.*]], %for.body ]
+; CHECK-UNORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ]
+; CHECK-UNORDERED: store float %[[IND_SUM_PHI]], float*
+; CHECK-UNORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[VEC_RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RES_PHI]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_and_reduction
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even
; with the -hints-allow-reordering flag set to true.
define float @fast_induction_unordered_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, float* noalias nocapture %B, i64 %N) {
-; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_unordered_reduction(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM2_023:%.*]] = phi float [ 3.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_022:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[X_021:%.*]] = phi float [ [[INIT:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: store float [[X_021]], float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd fast float [[X_021]], 2.000000e+00
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD3]] = fadd float [[SUM_022]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT: [[MUL]] = fmul float [[SUM2_023]], [[TMP0]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD6:%.*]] = fadd float [[ADD3_LCSSA]], [[MUL_LCSSA]]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD6]]
-;
-; CHECK-UNORDERED-LABEL: @fast_induction_unordered_reduction(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM2_023:%.*]] = phi float [ 3.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_022:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[X_021:%.*]] = phi float [ [[INIT:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: store float [[X_021]], float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD]] = fadd fast float [[X_021]], 2.000000e+00
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD3]] = fadd float [[SUM_022]], [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[MUL]] = fmul float [[SUM2_023]], [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ADD6:%.*]] = fadd float [[ADD3_LCSSA]], [[MUL_LCSSA]]
-; CHECK-UNORDERED-NEXT: ret float [[ADD6]]
-;
-; CHECK-ORDERED-LABEL: @fast_induction_unordered_reduction(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM2_023:%.*]] = phi float [ 3.000000e+00, [[ENTRY]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_022:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[X_021:%.*]] = phi float [ [[INIT:%.*]], [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: store float [[X_021]], float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD]] = fadd fast float [[X_021]], 2.000000e+00
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VALUES:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[ADD3]] = fadd float [[SUM_022]], [[TMP0]]
-; CHECK-ORDERED-NEXT: [[MUL]] = fmul float [[SUM2_023]], [[TMP0]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ [[ADD3]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ADD6:%.*]] = fadd float [[ADD3_LCSSA]], [[MUL_LCSSA]]
-; CHECK-ORDERED-NEXT: ret float [[ADD6]]
-;
+; CHECK-ORDERED-LABEL: @fast_induction_unordered_reduction
+; CHECK-ORDERED-NOT: vector.body
+
+; CHECK-UNORDERED-LABEL: @fast_induction_unordered_reduction
+; CHECK-UNORDERED-NOT: vector.body
+; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_unordered_reduction
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Test reductions for a VF of 1 and a UF > 1.
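; With VF=1 and UF=4 the checks below expect four interleaved scalar loads per vector
; iteration: the ordered case chains them into one serial fadd sequence off a single
; accumulator, while the unordered case keeps four independent partial sums and only
; combines them in middle.block, sketched roughly (illustrative names) as:
;   ordered:   %f1 = fadd float %phi, %l1   ...  %f4 = fadd float %f3, %l4
;   unordered: %f1 = fadd float %l1, %phi1  ...  then reduced: %r1 = fadd float %f2, %f1, etc.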
define float @fadd_scalar_vf(float* noalias nocapture readonly %a, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[SUM_07]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_scalar_vf(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1
-; CHECK-UNORDERED-NEXT: [[INDUCTION5:%.*]] = add i64 [[INDEX]], 2
-; CHECK-UNORDERED-NEXT: [[INDUCTION6:%.*]] = add i64 [[INDEX]], 3
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDUCTION]]
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION4]]
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION5]]
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION6]]
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = load float, float* [[TMP0]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = load float, float* [[TMP1]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = load float, float* [[TMP2]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP8]] = fadd float [[TMP4]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT: [[TMP9]] = fadd float [[TMP5]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT: [[TMP10]] = fadd float [[TMP6]], [[VEC_PHI2]]
-; CHECK-UNORDERED-NEXT: [[TMP11]] = fadd float [[TMP7]], [[VEC_PHI3]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd float [[TMP9]], [[TMP8]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd float [[TMP10]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd float [[TMP11]], [[BIN_RDX7]]
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP13]], [[SUM_07]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_scalar_vf(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-ORDERED-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-ORDERED-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDUCTION]]
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION1]]
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION2]]
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION3]]
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = load float, float* [[TMP0]], align 4
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = load float, float* [[TMP1]], align 4
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = load float, float* [[TMP2]], align 4
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = fadd float [[VEC_PHI]], [[TMP4]]
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = fadd float [[TMP8]], [[TMP5]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = fadd float [[TMP9]], [[TMP6]]
-; CHECK-ORDERED-NEXT: [[TMP11]] = fadd float [[TMP10]], [[TMP7]]
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP13]], [[SUM_07]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fadd_scalar_vf
+; CHECK-ORDERED: vector.body
+; CHECK-ORDERED: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, {{.*}} ], [ %[[FADD4:.*]], %vector.body ]
+; CHECK-ORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-ORDERED: %[[LOAD3:.*]] = load float, float*
+; CHECK-ORDERED: %[[LOAD4:.*]] = load float, float*
+; CHECK-ORDERED: %[[FADD1:.*]] = fadd float %[[VEC_PHI]], %[[LOAD1]]
+; CHECK-ORDERED: %[[FADD2:.*]] = fadd float %[[FADD1]], %[[LOAD2]]
+; CHECK-ORDERED: %[[FADD3:.*]] = fadd float %[[FADD2]], %[[LOAD3]]
+; CHECK-ORDERED: %[[FADD4]] = fadd float %[[FADD3]], %[[LOAD4]]
+; CHECK-ORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-ORDERED: scalar.ph
+; CHECK-ORDERED: %[[MERGE_RDX:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[FADD4]], %middle.block ]
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: %[[SUM_PHI:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD5:.*]], %for.body ]
+; CHECK-ORDERED: %[[LOAD5:.*]] = load float, float*
+; CHECK-ORDERED: %[[FADD5]] = fadd float %[[LOAD5]], %[[SUM_PHI]]
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD5]], %for.body ], [ %[[FADD4]], %middle.block ]
+; CHECK-ORDERED: ret float %[[RES_PHI]]
+
+; CHECK-UNORDERED-LABEL: @fadd_scalar_vf
+; CHECK-UNORDERED: vector.body
+; CHECK-UNORDERED: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD1:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI2:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI3:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD3:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[VEC_PHI4:.*]] = phi float [ -0.000000e+00, %vector.ph ], [ %[[FADD4:.*]], %vector.body ]
+; CHECK-UNORDERED: %[[LOAD1:.*]] = load float, float*
+; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float*
+; CHECK-UNORDERED: %[[LOAD3:.*]] = load float, float*
+; CHECK-UNORDERED: %[[LOAD4:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD1]] = fadd float %[[LOAD1]], %[[VEC_PHI1]]
+; CHECK-UNORDERED: %[[FADD2]] = fadd float %[[LOAD2]], %[[VEC_PHI2]]
+; CHECK-UNORDERED: %[[FADD3]] = fadd float %[[LOAD3]], %[[VEC_PHI3]]
+; CHECK-UNORDERED: %[[FADD4]] = fadd float %[[LOAD4]], %[[VEC_PHI4]]
+; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block
+; CHECK-UNORDERED: %[[BIN_RDX1:.*]] = fadd float %[[FADD2]], %[[FADD1]]
+; CHECK-UNORDERED: %[[BIN_RDX2:.*]] = fadd float %[[FADD3]], %[[BIN_RDX1]]
+; CHECK-UNORDERED: %[[BIN_RDX3:.*]] = fadd float %[[FADD4]], %[[BIN_RDX2]]
+; CHECK-UNORDERED: scalar.ph
+; CHECK-UNORDERED: %[[MERGE_RDX:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[BIN_RDX3]], %middle.block ]
+; CHECK-UNORDERED: for.body
+; CHECK-UNORDERED: %[[SUM_PHI:.*]] = phi float [ %[[MERGE_RDX]], %scalar.ph ], [ %[[FADD5:.*]], %for.body ]
+; CHECK-UNORDERED: %[[LOAD5:.*]] = load float, float*
+; CHECK-UNORDERED: %[[FADD5]] = fadd float %[[LOAD5]], %[[SUM_PHI]]
+; CHECK-UNORDERED: for.end
+; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD5]], %for.body ], [ %[[BIN_RDX3]], %middle.block ]
+; CHECK-UNORDERED: ret float %[[RES_PHI]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Same as above, but where the fadd has a fast-math flag (nnan).
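; The nnan flag on the scalar fadd is expected to be preserved on the interleaved fadds in
; the vector body and on the reduction fadds in middle.block, e.g. (illustrative names):
;   %f1 = fadd nnan float %l1, %phi1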
define float @fadd_scalar_vf_fmf(float* noalias nocapture readonly %a, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf_fmf(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd nnan float [[TMP0]], [[SUM_07]]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[ADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_scalar_vf_fmf(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1
-; CHECK-UNORDERED-NEXT: [[INDUCTION5:%.*]] = add i64 [[INDEX]], 2
-; CHECK-UNORDERED-NEXT: [[INDUCTION6:%.*]] = add i64 [[INDEX]], 3
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDUCTION]]
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION4]]
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION5]]
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION6]]
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = load float, float* [[TMP0]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = load float, float* [[TMP1]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = load float, float* [[TMP2]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP8]] = fadd nnan float [[TMP4]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT: [[TMP9]] = fadd nnan float [[TMP5]], [[VEC_PHI1]]
-; CHECK-UNORDERED-NEXT: [[TMP10]] = fadd nnan float [[TMP6]], [[VEC_PHI2]]
-; CHECK-UNORDERED-NEXT: [[TMP11]] = fadd nnan float [[TMP7]], [[VEC_PHI3]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd nnan float [[TMP9]], [[TMP8]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd nnan float [[TMP10]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd nnan float [[TMP11]], [[BIN_RDX7]]
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD]] = fadd nnan float [[TMP13]], [[SUM_07]]
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[ADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fadd_scalar_vf_fmf(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-ORDERED-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-ORDERED-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDUCTION]]
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION1]]
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION2]]
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION3]]
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = load float, float* [[TMP0]], align 4
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = load float, float* [[TMP1]], align 4
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = load float, float* [[TMP2]], align 4
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = fadd nnan float [[VEC_PHI]], [[TMP4]]
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = fadd nnan float [[TMP8]], [[TMP5]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = fadd nnan float [[TMP9]], [[TMP6]]
-; CHECK-ORDERED-NEXT: [[TMP11]] = fadd nnan float [[TMP10]], [[TMP7]]
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ADD]] = fadd nnan float [[TMP13]], [[SUM_07]]
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[ADD_LCSSA]]
-;
-
+; CHECK-ORDERED-LABEL: @fadd_scalar_vf_fmf
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD4:%.*]], %vector.body ]
+; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD3:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD4:%.*]] = load float, float*
+; CHECK-ORDERED: [[FADD1:%.*]] = fadd nnan float [[VEC_PHI]], [[LOAD1]]
+; CHECK-ORDERED: [[FADD2:%.*]] = fadd nnan float [[FADD1]], [[LOAD2]]
+; CHECK-ORDERED: [[FADD3:%.*]] = fadd nnan float [[FADD2]], [[LOAD3]]
+; CHECK-ORDERED: [[FADD4]] = fadd nnan float [[FADD3]], [[LOAD4]]
+; CHECK-ORDERED-NOT: @llvm.vector.reduce.fadd
+; CHECK-ORDERED: scalar.ph:
+; CHECK-ORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[FADD4]], %middle.block ]
+; CHECK-ORDERED: for.body:
+; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[FADD5:%.*]], %for.body ]
+; CHECK-ORDERED: [[LOAD5:%.*]] = load float, float*
+; CHECK-ORDERED: [[FADD5]] = fadd nnan float [[LOAD5]], [[SUM_07]]
+; CHECK-ORDERED: for.end:
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[FADD5]], %for.body ], [ [[FADD4]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fadd_scalar_vf_fmf
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI4:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FADD4:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD3:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD4:%.*]] = load float, float*
+; CHECK-UNORDERED: [[FADD1]] = fadd nnan float [[LOAD1]], [[VEC_PHI1]]
+; CHECK-UNORDERED: [[FADD2]] = fadd nnan float [[LOAD2]], [[VEC_PHI2]]
+; CHECK-UNORDERED: [[FADD3]] = fadd nnan float [[LOAD3]], [[VEC_PHI3]]
+; CHECK-UNORDERED: [[FADD4]] = fadd nnan float [[LOAD4]], [[VEC_PHI4]]
+; CHECK-UNORDERED-NOT: @llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan float [[FADD2]], [[FADD1]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan float [[FADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd nnan float [[FADD4]], [[BIN_RDX2]]
+; CHECK-UNORDERED: scalar.ph:
+; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[BIN_RDX3]], %middle.block ]
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[FADD5:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD5:%.*]] = load float, float*
+; CHECK-UNORDERED: [[FADD5]] = fadd nnan float [[LOAD5]], [[SUM_07]]
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[FADD5]], %for.body ], [ [[BIN_RDX3]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_scalar_vf_fmf
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Test case where the reduction step is a first-order recurrence.
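; In the scalar loop the reduction is incremented by the previous iteration's value of the
; recurrence, roughly (names illustrative):
;   %red.next = fadd double %for, %red
;   %for.next = sitofp i32 %iv to double
; so the ordered vector form checked below shuffles the recurrence vector before feeding it
; into the in-order reduction call.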
define double @reduction_increment_by_first_order_recurrence() {
-; CHECK-NOT-VECTORIZED-LABEL: @reduction_increment_by_first_order_recurrence(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[LOOP:%.*]]
-; CHECK-NOT-VECTORIZED: loop:
-; CHECK-NOT-VECTORIZED-NEXT: [[RED:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[FOR:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[RED_NEXT]] = fadd double [[FOR]], [[RED]]
-; CHECK-NOT-VECTORIZED-NEXT: [[FOR_NEXT]] = sitofp i32 [[IV]] to double
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-NOT-VECTORIZED: exit:
-; CHECK-NOT-VECTORIZED-NEXT: [[RES:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret double [[RES]]
+; CHECK-ORDERED-LABEL: @reduction_increment_by_first_order_recurrence(
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[RED:%.*]] = phi double [ 0.000000e+00, %vector.ph ], [ [[RED_NEXT:%.*]], %vector.body ]
+; CHECK-ORDERED: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %vector.ph ], [ [[FOR_NEXT:%.*]], %vector.body ]
+; CHECK-ORDERED: [[FOR_NEXT]] = sitofp <4 x i32> %vec.ind to <4 x double>
+; CHECK-ORDERED: [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[FOR_NEXT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-ORDERED: [[RED_NEXT]] = call double @llvm.vector.reduce.fadd.v4f64(double [[RED]], <4 x double> [[TMP1]])
+; CHECK-ORDERED: scalar.ph:
+; CHECK-ORDERED: = phi double [ 0.000000e+00, %entry ], [ [[RED_NEXT]], %middle.block ]
;
; CHECK-UNORDERED-LABEL: @reduction_increment_by_first_order_recurrence(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP0]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-UNORDERED-NEXT: [[TMP2]] = fadd <4 x double> [[TMP1]], [[VEC_PHI]]
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0
-; CHECK-UNORDERED-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP2]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0
-; CHECK-UNORDERED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP0]], i32 3
-; CHECK-UNORDERED-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP0]], i32 2
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[LOOP:%.*]]
-; CHECK-UNORDERED: loop:
-; CHECK-UNORDERED-NEXT: [[RED:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
-; CHECK-UNORDERED-NEXT: [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-UNORDERED-NEXT: [[RED_NEXT]] = fadd double [[SCALAR_RECUR]], [[RED]]
-; CHECK-UNORDERED-NEXT: [[FOR_NEXT]] = sitofp i32 [[IV]] to double
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0
-; CHECK-UNORDERED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
-; CHECK-UNORDERED: exit:
-; CHECK-UNORDERED-NEXT: [[RES:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret double [[RES]]
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[RED:%.*]] = phi <4 x double> [ <double 0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %vector.ph ], [ [[RED_NEXT:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, %vector.ph ], [ [[FOR_NEXT:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[FOR_NEXT]] = sitofp <4 x i32> %vec.ind to <4 x double>
+; CHECK-UNORDERED: [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[FOR_NEXT]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-UNORDERED: [[RED_NEXT]] = fadd <4 x double> [[TMP1]], [[RED]]
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[RDX:%.*]] = call double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[RED_NEXT]])
+; CHECK-UNORDERED: scalar.ph:
+; CHECK-UNORDERED: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, %entry ], [ [[RDX]], %middle.block ]
;
-; CHECK-ORDERED-LABEL: @reduction_increment_by_first_order_recurrence(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ <double poison, double poison, double poison, double 0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP0]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-ORDERED-NEXT: [[TMP2]] = call double @llvm.vector.reduce.fadd.v4f64(double [[VEC_PHI]], <4 x double> [[TMP1]])
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-ORDERED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 0
-; CHECK-ORDERED-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, 0
-; CHECK-ORDERED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP0]], i32 3
-; CHECK-ORDERED-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP0]], i32 2
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[LOOP:%.*]]
-; CHECK-ORDERED: loop:
-; CHECK-ORDERED-NEXT: [[RED:%.*]] = phi double [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ]
-; CHECK-ORDERED-NEXT: [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-ORDERED-NEXT: [[RED_NEXT]] = fadd double [[SCALAR_RECUR]], [[RED]]
-; CHECK-ORDERED-NEXT: [[FOR_NEXT]] = sitofp i32 [[IV]] to double
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 0
-; CHECK-ORDERED-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP29:![0-9]+]]
-; CHECK-ORDERED: exit:
-; CHECK-ORDERED-NEXT: [[RES:%.*]] = phi double [ [[RED_NEXT]], [[LOOP]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret double [[RES]]
+; CHECK-NOT-VECTORIZED-LABEL: @reduction_increment_by_first_order_recurrence(
+; CHECK-NOT-VECTORIZED-NOT: vector.body
;
entry:
br label %loop
; We should not mark the fadd as an ordered reduction here as there are
; more than two uses of the instruction.
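+; Illustrative sketch of those uses (names follow the scalar checks that were
+; previously autogenerated for this function): %fadd feeds the reduction phi
+; and is also the incoming value of the exit phis in both %bb1 and %for.end,
+; giving it more than two uses.
+;   %red  = phi float [ 0.000000e+00, %entry ], [ %fadd, %bb2 ]
+;   %fadd = fadd float %red, 1.000000e+00
+;   %phi2 = phi float [ %fadd, %for.body ]   ; exit phi in %bb1
+;   %phi3 = phi float [ %fadd, %bb2 ]        ; exit phi in %for.end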
define float @fadd_multiple_use(i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_use(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT2:%.*]], [[BB2:%.*]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[BB2]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BB2]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[FADD]] = fadd float [[RED]], 1.000000e+00
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nsw i64 [[PHI1]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]]
-; CHECK-NOT-VECTORIZED: bb1:
-; CHECK-NOT-VECTORIZED-NEXT: [[PHI2:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[PHI2]]
-; CHECK-NOT-VECTORIZED: bb2:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT2]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: br i1 false, label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[PHI3:%.*]] = phi float [ [[FADD]], [[BB2]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[PHI3]]
-;
-; CHECK-UNORDERED-LABEL: @fadd_multiple_use(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT2:%.*]], [[BB2:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[BB2]] ]
-; CHECK-UNORDERED-NEXT: [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BB2]] ]
-; CHECK-UNORDERED-NEXT: [[FADD]] = fadd float [[RED]], 1.000000e+00
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nsw i64 [[PHI1]], 1
-; CHECK-UNORDERED-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]]
-; CHECK-UNORDERED: bb1:
-; CHECK-UNORDERED-NEXT: [[PHI2:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: ret float [[PHI2]]
-; CHECK-UNORDERED: bb2:
-; CHECK-UNORDERED-NEXT: [[IV_NEXT2]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: br i1 false, label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[PHI3:%.*]] = phi float [ [[FADD]], [[BB2]] ]
-; CHECK-UNORDERED-NEXT: ret float [[PHI3]]
-;
-; CHECK-ORDERED-LABEL: @fadd_multiple_use(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT2:%.*]], [[BB2:%.*]] ]
-; CHECK-ORDERED-NEXT: [[RED:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[FADD:%.*]], [[BB2]] ]
-; CHECK-ORDERED-NEXT: [[PHI1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[BB2]] ]
-; CHECK-ORDERED-NEXT: [[FADD]] = fadd float [[RED]], 1.000000e+00
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nsw i64 [[PHI1]], 1
-; CHECK-ORDERED-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP]], label [[BB2]], label [[BB1:%.*]]
-; CHECK-ORDERED: bb1:
-; CHECK-ORDERED-NEXT: [[PHI2:%.*]] = phi float [ [[FADD]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: ret float [[PHI2]]
-; CHECK-ORDERED: bb2:
-; CHECK-ORDERED-NEXT: [[IV_NEXT2]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: br i1 false, label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[PHI3:%.*]] = phi float [ [[FADD]], [[BB2]] ]
-; CHECK-ORDERED-NEXT: ret float [[PHI3]]
-;
+; CHECK-ORDERED-LABEL: @fadd_multiple_use
+; CHECK-ORDERED-NOT: vector.body
+; CHECK-UNORDERED-LABEL: @fadd_multiple_use
+; CHECK-UNORDERED-NOT: vector.body
+; CHECK-NOT-VECTORIZED-LABEL: @fadd_multiple_use
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Test case where the loop has a call to the llvm.fmuladd intrinsic.
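+; Illustrative sketch of the scalar loop (%a.val and %b.val are hypothetical
+; operand names): each iteration folds a[i]*b[i] into the running sum through
+; the intrinsic.
+;   %muladd = tail call float @llvm.fmuladd.f32(float %a.val, float %b.val, float %sum.07)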
define float @fmuladd_strict(float* %a, float* %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fmuladd_strict(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 32
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]]
-; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]]
-; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP21]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 8
-; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x float>, <8 x float>* [[TMP23]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 16
-; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x float>, <8 x float>* [[TMP25]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 24
-; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x float>, <8 x float>* [[TMP27]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP28]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD7]], <8 x float> [[VEC_PHI]])
-; CHECK-UNORDERED-NEXT: [[TMP29]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD4]], <8 x float> [[WIDE_LOAD8]], <8 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED-NEXT: [[TMP30]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD5]], <8 x float> [[WIDE_LOAD9]], <8 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED-NEXT: [[TMP31]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD6]], <8 x float> [[WIDE_LOAD10]], <8 x float> [[VEC_PHI3]])
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP29]], [[TMP28]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd <8 x float> [[TMP30]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd <8 x float> [[TMP31]], [[BIN_RDX11]]
-; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX12]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP34]], float [[TMP35]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fmuladd_strict(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 32
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]]
-; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]]
-; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]]
-; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP21]], align 4
-; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 8
-; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP23]], align 4
-; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 16
-; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP25]], align 4
-; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 24
-; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>*
-; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP27]], align 4
-; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = fmul <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = fmul <8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = fmul <8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = fmul <8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[TMP28]])
-; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP32]], <8 x float> [[TMP29]])
-; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP33]], <8 x float> [[TMP30]])
-; CHECK-ORDERED-NEXT: [[TMP35]] = call float @llvm.vector.reduce.fadd.v8f32(float [[TMP34]], <8 x float> [[TMP31]])
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP37]], float [[TMP38]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP35]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fmuladd_strict
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX3:%.*]], %vector.body ]
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[VEC_PHI]], <8 x float> [[FMUL]])
+; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX]], <8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX1]], <8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[RDX3]] = call float @llvm.vector.reduce.fadd.v8f32(float [[RDX2]], <8 x float> [[FMUL3]])
+; CHECK-ORDERED: for.body:
+; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-ORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD1]], float [[SUM_07]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX3]], %middle.block ]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_strict
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX3]])
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_strict
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Test reductions for a VF of 1 and a UF > 1 where the loop has a call to the llvm.fmuladd intrinsic.
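+; Illustrative sketch based on the checks below: with a scalar VF and an
+; interleave factor of 4, the unordered form keeps one scalar accumulator per
+; unrolled copy, while the ordered form chains plain fadds into a single phi.
+;   %fadd  = fadd float %vec.phi, %fmul
+;   %fadd1 = fadd float %fadd, %fmul1
+;   %fadd2 = fadd float %fadd1, %fmul2
+;   %fadd3 = fadd float %fadd2, %fmul3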
define float @fmuladd_scalar_vf(float* %a, float* %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_scalar_vf(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP11]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fmuladd_scalar_vf(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1
-; CHECK-UNORDERED-NEXT: [[INDUCTION5:%.*]] = add i64 [[INDEX]], 2
-; CHECK-UNORDERED-NEXT: [[INDUCTION6:%.*]] = add i64 [[INDEX]], 3
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDUCTION]]
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION4]]
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION5]]
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION6]]
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = load float, float* [[TMP0]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = load float, float* [[TMP1]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = load float, float* [[TMP2]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDUCTION]]
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION4]]
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION5]]
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION6]]
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = load float, float* [[TMP8]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = load float, float* [[TMP9]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = load float, float* [[TMP10]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = load float, float* [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP16]] = call float @llvm.fmuladd.f32(float [[TMP4]], float [[TMP12]], float [[VEC_PHI]])
-; CHECK-UNORDERED-NEXT: [[TMP17]] = call float @llvm.fmuladd.f32(float [[TMP5]], float [[TMP13]], float [[VEC_PHI1]])
-; CHECK-UNORDERED-NEXT: [[TMP18]] = call float @llvm.fmuladd.f32(float [[TMP6]], float [[TMP14]], float [[VEC_PHI2]])
-; CHECK-UNORDERED-NEXT: [[TMP19]] = call float @llvm.fmuladd.f32(float [[TMP7]], float [[TMP15]], float [[VEC_PHI3]])
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd float [[TMP17]], [[TMP16]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX7:%.*]] = fadd float [[TMP18]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX8:%.*]] = fadd float [[TMP19]], [[BIN_RDX7]]
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP21]], float [[TMP22]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fmuladd_scalar_vf(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-ORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-ORDERED: vector.ph:
-; CHECK-ORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-ORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-ORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-ORDERED: vector.body:
-; CHECK-ORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-ORDERED-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-ORDERED-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-ORDERED-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDUCTION]]
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION1]]
-; CHECK-ORDERED-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION2]]
-; CHECK-ORDERED-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION3]]
-; CHECK-ORDERED-NEXT: [[TMP4:%.*]] = load float, float* [[TMP0]], align 4
-; CHECK-ORDERED-NEXT: [[TMP5:%.*]] = load float, float* [[TMP1]], align 4
-; CHECK-ORDERED-NEXT: [[TMP6:%.*]] = load float, float* [[TMP2]], align 4
-; CHECK-ORDERED-NEXT: [[TMP7:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-ORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDUCTION]]
-; CHECK-ORDERED-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION1]]
-; CHECK-ORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION2]]
-; CHECK-ORDERED-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION3]]
-; CHECK-ORDERED-NEXT: [[TMP12:%.*]] = load float, float* [[TMP8]], align 4
-; CHECK-ORDERED-NEXT: [[TMP13:%.*]] = load float, float* [[TMP9]], align 4
-; CHECK-ORDERED-NEXT: [[TMP14:%.*]] = load float, float* [[TMP10]], align 4
-; CHECK-ORDERED-NEXT: [[TMP15:%.*]] = load float, float* [[TMP11]], align 4
-; CHECK-ORDERED-NEXT: [[TMP16:%.*]] = fmul float [[TMP4]], [[TMP12]]
-; CHECK-ORDERED-NEXT: [[TMP17:%.*]] = fmul float [[TMP5]], [[TMP13]]
-; CHECK-ORDERED-NEXT: [[TMP18:%.*]] = fmul float [[TMP6]], [[TMP14]]
-; CHECK-ORDERED-NEXT: [[TMP19:%.*]] = fmul float [[TMP7]], [[TMP15]]
-; CHECK-ORDERED-NEXT: [[TMP20:%.*]] = fadd float [[VEC_PHI]], [[TMP16]]
-; CHECK-ORDERED-NEXT: [[TMP21:%.*]] = fadd float [[TMP20]], [[TMP17]]
-; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = fadd float [[TMP21]], [[TMP18]]
-; CHECK-ORDERED-NEXT: [[TMP23]] = fadd float [[TMP22]], [[TMP19]]
-; CHECK-ORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
-; CHECK-ORDERED: middle.block:
-; CHECK-ORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-ORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-ORDERED: scalar.ph:
-; CHECK-ORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-ORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[TMP25]], float [[TMP26]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fmuladd_scalar_vf
+; CHECK-ORDERED: vector.body:
+; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FADD3:%.*]], %vector.body ]
+; CHECK-ORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD3:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD4:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD5:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD6:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD7:%.*]] = load float, float*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul float [[LOAD]], [[LOAD4]]
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul float [[LOAD1]], [[LOAD5]]
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul float [[LOAD2]], [[LOAD6]]
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul float [[LOAD3]], [[LOAD7]]
+; CHECK-ORDERED: [[FADD:%.*]] = fadd float [[VEC_PHI]], [[FMUL]]
+; CHECK-ORDERED: [[FADD1:%.*]] = fadd float [[FADD]], [[FMUL1]]
+; CHECK-ORDERED: [[FADD2:%.*]] = fadd float [[FADD1]], [[FMUL2]]
+; CHECK-ORDERED: [[FADD3]] = fadd float [[FADD2]], [[FMUL3]]
+; CHECK-ORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-ORDERED: scalar.ph
+; CHECK-ORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[FADD3]], %middle.block ]
+; CHECK-ORDERED: for.body
+; CHECK-ORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-ORDERED: [[LOAD8:%.*]] = load float, float*
+; CHECK-ORDERED: [[LOAD9:%.*]] = load float, float*
+; CHECK-ORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]])
+; CHECK-ORDERED: for.end
+; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[FADD3]], %middle.block ]
+; CHECK-ORDERED: ret float [[RES]]
+
+; CHECK-UNORDERED-LABEL: @fmuladd_scalar_vf
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi float [ -0.000000e+00, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD1:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD3:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD4:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD5:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD6:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD7:%.*]] = load float, float*
+; CHECK-UNORDERED: [[FMULADD]] = call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD4]], float [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call float @llvm.fmuladd.f32(float [[LOAD1]], float [[LOAD5]], float [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call float @llvm.fmuladd.f32(float [[LOAD2]], float [[LOAD6]], float [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call float @llvm.fmuladd.f32(float [[LOAD3]], float [[LOAD7]], float [[VEC_PHI3]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd float [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd float [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd float [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: scalar.ph:
+; CHECK-UNORDERED: [[MERGE_RDX:%.*]] = phi float [ 0.000000e+00, %entry ], [ [[BIN_RDX2]], %middle.block ]
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[MERGE_RDX]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD8:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD9:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[LOAD8]], float [[LOAD9]], float [[SUM_07]])
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD]], %for.body ], [ [[BIN_RDX2]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_scalar_vf
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Test case where the reduction phi is one of the mul operands of the fmuladd.
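+; Illustrative sketch (%a.val and %b.val are hypothetical operand names): the
+; running sum is used as a multiplicand rather than as the addend, so none of
+; the three configurations vectorizes this loop, as the checks below show.
+;   %muladd = tail call float @llvm.fmuladd.f32(float %sum.07, float %a.val, float %b.val)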
define float @fmuladd_phi_is_mul_operand(float* %a, float* %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_mul_operand(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[TMP1]])
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_mul_operand(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[TMP1]])
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fmuladd_phi_is_mul_operand(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[TMP1]])
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]]
-;
+; CHECK-ORDERED-LABEL: @fmuladd_phi_is_mul_operand
+; CHECK-ORDERED-NOT: vector.body
+; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_mul_operand
+; CHECK-UNORDERED-NOT: vector.body
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_mul_operand
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Test case where the reduction phi is two operands of the fmuladd.
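+; Illustrative sketch (%a.val is a hypothetical operand name): the reduction
+; phi appears as both a multiplicand and the addend of the intrinsic, which
+; again keeps every configuration from vectorizing.
+;   %muladd = tail call float @llvm.fmuladd.f32(float %sum.07, float %a.val, float %sum.07)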
define float @fmuladd_phi_is_two_operands(float* %a, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_two_operands(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_two_operands(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: ret float [[MULADD_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fmuladd_phi_is_two_operands(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[MULADD]] = tail call float @llvm.fmuladd.f32(float [[SUM_07]], float [[TMP0]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[MULADD_LCSSA:%.*]] = phi float [ [[MULADD]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: ret float [[MULADD_LCSSA]]
-;
+; CHECK-ORDERED-LABEL: @fmuladd_phi_is_two_operands
+; CHECK-ORDERED-NOT: vector.body
+; CHECK-UNORDERED-LABEL: @fmuladd_phi_is_two_operands
+; CHECK-UNORDERED-NOT: vector.body
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_phi_is_two_operands
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Test case with multiple calls to llvm.fmuladd, which are not safe to reorder,
; so the loop is only vectorized in the unordered (fast) case.
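+; Illustrative sketch of the chained calls (%a.val and %b.val are hypothetical
+; operand names; the call shape follows the removed scalar checks): the second
+; fmuladd consumes the result of the first, so only the unordered (fast-math)
+; configuration vectorizes the loop.
+;   %muladd  = tail call float @llvm.fmuladd.f32(float %a.val, float %b.val, float %sum.07)
+;   %muladd2 = tail call float @llvm.fmuladd.f32(float %a.val, float %b.val, float %muladd)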
define float @fmuladd_multiple(float* %a, float* %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_multiple(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[MULADD]])
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD2_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @fmuladd_multiple(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 32
-; CHECK-UNORDERED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-UNORDERED: vector.ph:
-; CHECK-UNORDERED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 32
-; CHECK-UNORDERED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-UNORDERED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-UNORDERED: vector.body:
-; CHECK-UNORDERED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP32:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI2:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[VEC_PHI3:%.*]] = phi <8 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-UNORDERED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-UNORDERED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-UNORDERED-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-UNORDERED-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP3]]
-; CHECK-UNORDERED-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-UNORDERED-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-UNORDERED-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-UNORDERED-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-UNORDERED-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]]
-; CHECK-UNORDERED-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]]
-; CHECK-UNORDERED-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]]
-; CHECK-UNORDERED-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-UNORDERED-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, <8 x float>* [[TMP21]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 8
-; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x float>, <8 x float>* [[TMP23]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 16
-; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x float>, <8 x float>* [[TMP25]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 24
-; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <8 x float>*
-; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x float>, <8 x float>* [[TMP27]], align 4
-; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD7]], <8 x float> [[VEC_PHI]])
-; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD4]], <8 x float> [[WIDE_LOAD8]], <8 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD5]], <8 x float> [[WIDE_LOAD9]], <8 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD6]], <8 x float> [[WIDE_LOAD10]], <8 x float> [[VEC_PHI3]])
-; CHECK-UNORDERED-NEXT: [[TMP32]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD7]], <8 x float> [[TMP28]])
-; CHECK-UNORDERED-NEXT: [[TMP33]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD4]], <8 x float> [[WIDE_LOAD8]], <8 x float> [[TMP29]])
-; CHECK-UNORDERED-NEXT: [[TMP34]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD5]], <8 x float> [[WIDE_LOAD9]], <8 x float> [[TMP30]])
-; CHECK-UNORDERED-NEXT: [[TMP35]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD6]], <8 x float> [[WIDE_LOAD10]], <8 x float> [[TMP31]])
-; CHECK-UNORDERED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
-; CHECK-UNORDERED: middle.block:
-; CHECK-UNORDERED-NEXT: [[BIN_RDX:%.*]] = fadd <8 x float> [[TMP33]], [[TMP32]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX11:%.*]] = fadd <8 x float> [[TMP34]], [[BIN_RDX]]
-; CHECK-UNORDERED-NEXT: [[BIN_RDX12:%.*]] = fadd <8 x float> [[TMP35]], [[BIN_RDX11]]
-; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX12]])
-; CHECK-UNORDERED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-UNORDERED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK-UNORDERED: scalar.ph:
-; CHECK-UNORDERED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-UNORDERED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP38]], float [[TMP39]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP38]], float [[TMP39]], float [[MULADD]])
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ], [ [[TMP37]], [[MIDDLE_BLOCK]] ]
-; CHECK-UNORDERED-NEXT: ret float [[MULADD2_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @fmuladd_multiple(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[MULADD]])
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: ret float [[MULADD2_LCSSA]]
-;
-
-
+; CHECK-ORDERED-LABEL: @fmuladd_multiple
+; CHECK-ORDERED-NOT: vector.body:
+
+; CHECK-UNORDERED-LABEL: @fmuladd_multiple
+; CHECK-UNORDERED: vector.body:
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <8 x float>, <8 x float>*
+; CHECK-UNORDERED: [[FMULADD:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD2]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[WIDE_LOAD]], <8 x float> [[WIDE_LOAD4]], <8 x float> [[FMULADD]])
+; CHECK-UNORDERED-NOT: llvm.vector.reduce.fadd
+; CHECK-UNORDERED: middle.block:
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[BIN_RDX3:%.*]] = fadd <8 x float>
+; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX3]])
+; CHECK-UNORDERED: for.body:
+; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ {{.*}}, %scalar.ph ], [ [[MULADD2:%.*]], %for.body ]
+; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
+; CHECK-UNORDERED: [[LOAD2:%.*]] = load float, float*
+; CHECK-UNORDERED: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[SUM_07]])
+; CHECK-UNORDERED: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[LOAD]], float [[LOAD2]], float [[MULADD]])
+; CHECK-UNORDERED: for.end:
+; CHECK-UNORDERED: [[RES:%.*]] = phi float [ [[MULADD2]], %for.body ], [ [[RDX]], %middle.block ]
+; CHECK-UNORDERED: ret float [[RES]]
+
+; CHECK-NOT-VECTORIZED-LABEL: @fmuladd_multiple
+; CHECK-NOT-VECTORIZED-NOT: vector.body:
entry:
br label %for.body
; Same as above, but the first fmuladd is one of the mul operands of the second fmuladd.
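; Illustrative C sketch (hypothetical names, fmaf models llvm.fmuladd): the first
; fma result feeds a multiplicand of the second, so none of the three modes below
; form a vector.body:
;   float multiple_fmuladds_mul_operand(float *a, float *b, long n) {
;     float sum = 0.0f;
;     for (long i = 0; i < n; ++i) {
;       float t = fmaf(a[i], b[i], sum);
;       sum = fmaf(a[i], t, b[i]);  /* t used as a mul operand */
;     }
;     return sum;
;   }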
define float @multiple_fmuladds_mul_operand(float* %a, float* %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_mul_operand(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[TMP1]])
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD2_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @multiple_fmuladds_mul_operand(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[TMP1]])
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: ret float [[MULADD2_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @multiple_fmuladds_mul_operand(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[TMP1]])
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: ret float [[MULADD2_LCSSA]]
-;
+; CHECK-ORDERED-LABEL: @multiple_fmuladds_mul_operand
+; CHECK-ORDERED-NOT: vector.body
+; CHECK-UNORDERED-LABEL: @multiple_fmuladds_mul_operand
+; CHECK-UNORDERED-NOT: vector.body
+; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_mul_operand
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Same as above, but the first fmuladd supplies two of the operands of the second fmuladd.
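; Illustrative C sketch (hypothetical names, fmaf models llvm.fmuladd): the first
; fma result is reused as both a multiplicand and the addend of the second, so
; again no mode vectorizes the loop:
;   float multiple_fmuladds_two_operands(float *a, float *b, long n) {
;     float sum = 0.0f;
;     for (long i = 0; i < n; ++i) {
;       float t = fmaf(a[i], b[i], sum);
;       sum = fmaf(a[i], t, t);  /* t used as both mul operand and addend */
;     }
;     return sum;
;   }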
define float @multiple_fmuladds_two_operands(float* %a, float* %b, i64 %n) {
-; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_two_operands(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[MULADD]])
-; CHECK-NOT-VECTORIZED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4]]
-; CHECK-NOT-VECTORIZED: for.end:
-; CHECK-NOT-VECTORIZED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: ret float [[MULADD2_LCSSA]]
-;
-; CHECK-UNORDERED-LABEL: @multiple_fmuladds_two_operands(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-UNORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-UNORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[MULADD]])
-; CHECK-UNORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]]
-; CHECK-UNORDERED: for.end:
-; CHECK-UNORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: ret float [[MULADD2_LCSSA]]
-;
-; CHECK-ORDERED-LABEL: @multiple_fmuladds_two_operands(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[SUM_07:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[MULADD2:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[IV]]
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-ORDERED-NEXT: [[MULADD:%.*]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[TMP1]], float [[SUM_07]])
-; CHECK-ORDERED-NEXT: [[MULADD2]] = tail call float @llvm.fmuladd.f32(float [[TMP0]], float [[MULADD]], float [[MULADD]])
-; CHECK-ORDERED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N:%.*]]
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP34]]
-; CHECK-ORDERED: for.end:
-; CHECK-ORDERED-NEXT: [[MULADD2_LCSSA:%.*]] = phi float [ [[MULADD2]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: ret float [[MULADD2_LCSSA]]
-;
+; CHECK-ORDERED-LABEL: @multiple_fmuladds_two_operands
+; CHECK-ORDERED-NOT: vector.body
+; CHECK-UNORDERED-LABEL: @multiple_fmuladds_two_operands
+; CHECK-UNORDERED-NOT: vector.body
+; CHECK-NOT-VECTORIZED-LABEL: @multiple_fmuladds_two_operands
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
br label %for.body
; Test case with invariant store where fadd is strict.
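; Illustrative C sketch (hypothetical names) of the pattern under test: a strict
; fadd reduction whose running value is also stored to the loop-invariant address
; dst[42] on every iteration:
;   void reduction_store_to_invariant_address(float *dst, const float *src) {
;     dst[42] = 0.0f;
;     float sum = 0.0f;
;     for (long i = 0; i < 1000; ++i) {
;       sum += src[i];
;       dst[42] = sum;  /* store to the same address every iteration */
;     }
;   }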
define void @reduction_store_to_invariant_address(float* %dst, float* readonly %src) {
-; CHECK-NOT-VECTORIZED-LABEL: @reduction_store_to_invariant_address(
-; CHECK-NOT-VECTORIZED-NEXT: entry:
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 42
-; CHECK-NOT-VECTORIZED-NEXT: store float 0.000000e+00, float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-NOT-VECTORIZED: for.body:
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NOT-VECTORIZED-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NOT-VECTORIZED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[ADD]] = fadd float [[TMP0]], [[TMP1]]
-; CHECK-NOT-VECTORIZED-NEXT: store float [[ADD]], float* [[ARRAYIDX]], align 4
-; CHECK-NOT-VECTORIZED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NOT-VECTORIZED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NOT-VECTORIZED-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
-; CHECK-NOT-VECTORIZED: for.cond.cleanup:
-; CHECK-NOT-VECTORIZED-NEXT: ret void
-;
-; CHECK-UNORDERED-LABEL: @reduction_store_to_invariant_address(
-; CHECK-UNORDERED-NEXT: entry:
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 42
-; CHECK-UNORDERED-NEXT: store float 0.000000e+00, float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-UNORDERED: for.body:
-; CHECK-UNORDERED-NEXT: [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-UNORDERED-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-UNORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4
-; CHECK-UNORDERED-NEXT: [[ADD]] = fadd float [[TMP0]], [[TMP1]]
-; CHECK-UNORDERED-NEXT: store float [[ADD]], float* [[ARRAYIDX]], align 4
-; CHECK-UNORDERED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-UNORDERED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-UNORDERED-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
-; CHECK-UNORDERED: for.cond.cleanup:
-; CHECK-UNORDERED-NEXT: ret void
-;
; CHECK-ORDERED-LABEL: @reduction_store_to_invariant_address(
-; CHECK-ORDERED-NEXT: entry:
-; CHECK-ORDERED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 42
-; CHECK-ORDERED-NEXT: store float 0.000000e+00, float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-ORDERED: for.body:
-; CHECK-ORDERED-NEXT: [[TMP0:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-ORDERED-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]]
-; CHECK-ORDERED-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4
-; CHECK-ORDERED-NEXT: [[ADD]] = fadd float [[TMP0]], [[TMP1]]
-; CHECK-ORDERED-NEXT: store float [[ADD]], float* [[ARRAYIDX]], align 4
-; CHECK-ORDERED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-ORDERED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-ORDERED-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
-; CHECK-ORDERED: for.cond.cleanup:
-; CHECK-ORDERED-NEXT: ret void
-;
+; CHECK-ORDERED-NOT: vector.body
+; CHECK-UNORDERED-LABEL: @reduction_store_to_invariant_address(
+; CHECK-UNORDERED-NOT: vector.body
+; CHECK-NOT-VECTORIZED-LABEL: @reduction_store_to_invariant_address(
+; CHECK-NOT-VECTORIZED-NOT: vector.body
entry:
%arrayidx = getelementptr inbounds float, float* %dst, i64 42
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve < %s -S | FileCheck %s
define void @cmpsel_i32(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @cmpsel_i32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = select <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 10, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <vscale x 4 x i32>*
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP13]], 0
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[TOBOOL_NOT]], i32 2, i32 10
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[COND]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* {{.*}}, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = select <vscale x 4 x i1> [[TMP1]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 10, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK: store <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32>* {{.*}}, align 4
;
entry:
br label %for.body
define void @cmpsel_f32(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @cmpsel_f32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP7:%.*]] = select <vscale x 4 x i1> [[TMP6]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+01, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <vscale x 4 x float>*
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP7]], <vscale x 4 x float>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP13]], 3.000000e+00
-; CHECK-NEXT: [[CONV:%.*]] = select i1 [[CMP1]], float 1.000000e+01, float 2.000000e+00
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[CONV]], float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* {{.*}}, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP2:%.*]] = select <vscale x 4 x i1> [[TMP1]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+01, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK: store <vscale x 4 x float> [[TMP2]], <vscale x 4 x float>* {{.*}}, align 4
entry:
br label %for.body
define void @fneg_f32(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) {
; CHECK-LABEL: @fneg_f32(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = fneg <vscale x 4 x float> [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP6]], <vscale x 4 x float>* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[FNEG:%.*]] = fneg float [[TMP12]]
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[FNEG]], float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* {{.*}}, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <vscale x 4 x float> [[WIDE_LOAD]]
+; CHECK: store <vscale x 4 x float> [[TMP1]], <vscale x 4 x float>* {{.*}}, align 4
entry:
br label %for.body
; CHECK-NEXT: [[N_VEC6:%.*]] = sub i64 [[N]], [[N_MOD_VF5]]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP30:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX8]], 0
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP25]]
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, i64* [[TMP26]], i32 0
; CHECK-NEXT: [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>*
; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4
; CHECK-NEXT: [[TMP29:%.*]] = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> [[WIDE_LOAD10]])
; CHECK-NEXT: [[TMP30]] = and i64 [[TMP29]], [[VEC_PHI9]]
-; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 2
; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]]
; CHECK-NEXT: br i1 [[TMP31]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi <2 x i64> [ [[TMP24]], [[VEC_EPILOG_PH]] ], [ [[TMP29:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX8]], 0
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP25]]
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, i64* [[TMP26]], i32 0
; CHECK-NEXT: [[TMP28:%.*]] = bitcast i64* [[TMP27]] to <2 x i64>*
; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i64>, <2 x i64>* [[TMP28]], align 4
; CHECK-NEXT: [[TMP29]] = add <2 x i64> [[WIDE_LOAD10]], [[VEC_PHI9]]
-; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 2
; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]]
; CHECK-NEXT: br i1 [[TMP30]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[ADD]] = add i64 [[TMP32]], [[SUM]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: [[ADD_LCSSA4:%.*]] = phi i64 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP31]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_END]]
; CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[N]], [[N_MOD_VF4]]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi float [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX7]], 0
; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 0
; CHECK-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <2 x float>*
; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x float>, <2 x float>* [[TMP26]], align 4
; CHECK-NEXT: [[TMP27]] = call float @llvm.vector.reduce.fadd.v2f32(float [[VEC_PHI8]], <2 x float> [[WIDE_LOAD9]])
-; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX7]], 2
; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[ADD]] = fadd float [[TMP29]], [[SUM_07]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: [[ADD_LCSSA3:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP27]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_END]]
; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 1024, [[N_MOD_VF2]]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX5]], 0
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP27]]
; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, i8* [[TMP28]], i32 0
; CHECK-NEXT: [[TMP30:%.*]] = bitcast i8* [[TMP29]] to <vscale x 8 x i8>*
; CHECK-NEXT: store <vscale x 8 x i8> shufflevector (<vscale x 8 x i8> insertelement (<vscale x 8 x i8> poison, i8 1, i32 0), <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer), <vscale x 8 x i8>* [[TMP30]], align 1
; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 8
-; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[OFFSET_IDX]], [[TMP32]]
+; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], [[TMP32]]
; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[TMP33]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK-VF8: vec.epilog.vector.body:
-; CHECK-VF8-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-VF8-NEXT: [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-VF8-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[TMP21:%.*]] = add i64 [[INDEX2]], 0
; CHECK-VF8-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP21]]
; CHECK-VF8-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP22]], i32 0
; CHECK-VF8-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP23]] to <8 x i8>*
; CHECK-VF8-NEXT: store <8 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, <8 x i8>* [[TMP24]], align 1
-; CHECK-VF8-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[OFFSET_IDX]], 8
+; CHECK-VF8-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8
; CHECK-VF8-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
; CHECK-VF8-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-VF8: vec.epilog.middle.block:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX2]], 0
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP21]]
; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[TMP22]], i32 0
; CHECK-NEXT: [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <8 x i64>*
; CHECK-NEXT: store <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64>* [[TMP24]], align 1
-; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8
; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK-VF8: vec.epilog.vector.body:
-; CHECK-VF8-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-VF8-NEXT: [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-VF8-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-VF8-NEXT: [[TMP21:%.*]] = add i64 [[INDEX2]], 0
; CHECK-VF8-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP21]]
; CHECK-VF8-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, i64* [[TMP22]], i32 0
; CHECK-VF8-NEXT: [[TMP24:%.*]] = bitcast i64* [[TMP23]] to <8 x i64>*
; CHECK-VF8-NEXT: store <8 x i64> <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>, <8 x i64>* [[TMP24]], align 1
-; CHECK-VF8-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[OFFSET_IDX]], 8
+; CHECK-VF8-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8
; CHECK-VF8-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
; CHECK-VF8-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-VF8: vec.epilog.middle.block:
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-unknown-linux-gnu"
define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 {
-; CHECK-LABEL: @inv_store_last_lane(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = shl nsw <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32>* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP13:%.*]] = shl i32 [[TMP12]], 2
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], -1
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x i32> [[TMP6]], i32 [[TMP14]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP16]], 1
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[INV:%.*]], i64 42
-; CHECK-NEXT: store i32 [[MUL_LCSSA]], i32* [[ARRAYIDX5]], align 4
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @inv_store_last_lane
+; CHECK: vector.body:
+; CHECK: store <vscale x 4 x i32> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
+; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x i32> %[[VEC_VAL]], i32 %[[LAST_LANE]]
entry:
br label %for.body
}
define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) #0 {
-; CHECK-LABEL: @ret_last_lane(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = fmul <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP6]], <vscale x 4 x float>* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP13:%.*]] = shl i32 [[TMP12]], 2
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP13]], -1
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <vscale x 4 x float> [[TMP6]], i32 [[TMP14]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP16]], 2.000000e+00
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[MUL]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[MUL_LCSSA]]
-;
+; CHECK-LABEL: @ret_last_lane
+; CHECK: vector.body:
+; CHECK: store <vscale x 4 x float> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
+; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x float> %[[VEC_VAL]], i32 %[[LAST_LANE]]
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mattr=+sve -force-vector-width=4 -pass-remarks-analysis=loop-vectorize -S 2>%t | FileCheck %s
; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARKS
target triple = "aarch64-linux-gnu"
; CHECK-REMARKS: Scalable vectorization is not supported for all element types found in this loop
define dso_local void @loop_sve_i128(i128* nocapture %ptr, i64 %N) {
-; CHECK-LABEL: @loop_sve_i128(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i128, i128* [[PTR:%.*]], i64 [[INDUCTION]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i128, i128* [[PTR]], i64 [[INDUCTION1]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i128, i128* [[TMP0]], align 16
-; CHECK-NEXT: [[TMP3:%.*]] = load i128, i128* [[TMP1]], align 16
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw i128 [[TMP2]], 42
-; CHECK-NEXT: [[TMP5:%.*]] = add nsw i128 [[TMP3]], 42
-; CHECK-NEXT: store i128 [[TMP4]], i128* [[TMP0]], align 16
-; CHECK-NEXT: store i128 [[TMP5]], i128* [[TMP1]], align 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i128, i128* [[PTR]], i64 [[IV]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i128, i128* [[ARRAYIDX]], align 16
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i128 [[TMP7]], 42
-; CHECK-NEXT: store i128 [[ADD]], i128* [[ARRAYIDX]], align 16
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @loop_sve_i128
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load i128, i128* {{.*}}
+; CHECK-NEXT: %[[LOAD2:.*]] = load i128, i128* {{.*}}
+; CHECK-NEXT: %[[ADD1:.*]] = add nsw i128 %[[LOAD1]], 42
+; CHECK-NEXT: %[[ADD2:.*]] = add nsw i128 %[[LOAD2]], 42
+; CHECK-NEXT: store i128 %[[ADD1]], i128* {{.*}}
+; CHECK-NEXT: store i128 %[[ADD2]], i128* {{.*}}
entry:
br label %for.body
; CHECK-REMARKS: Scalable vectorization is not supported for all element types found in this loop
define dso_local void @loop_sve_f128(fp128* nocapture %ptr, i64 %N) {
-; CHECK-LABEL: @loop_sve_f128(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds fp128, fp128* [[PTR:%.*]], i64 [[INDUCTION]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds fp128, fp128* [[PTR]], i64 [[INDUCTION1]]
-; CHECK-NEXT: [[TMP2:%.*]] = load fp128, fp128* [[TMP0]], align 16
-; CHECK-NEXT: [[TMP3:%.*]] = load fp128, fp128* [[TMP1]], align 16
-; CHECK-NEXT: [[TMP4:%.*]] = fsub fp128 [[TMP2]], 0xL00000000000000008000000000000000
-; CHECK-NEXT: [[TMP5:%.*]] = fsub fp128 [[TMP3]], 0xL00000000000000008000000000000000
-; CHECK-NEXT: store fp128 [[TMP4]], fp128* [[TMP0]], align 16
-; CHECK-NEXT: store fp128 [[TMP5]], fp128* [[TMP1]], align 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds fp128, fp128* [[PTR]], i64 [[IV]]
-; CHECK-NEXT: [[TMP7:%.*]] = load fp128, fp128* [[ARRAYIDX]], align 16
-; CHECK-NEXT: [[ADD:%.*]] = fsub fp128 [[TMP7]], 0xL00000000000000008000000000000000
-; CHECK-NEXT: store fp128 [[ADD]], fp128* [[ARRAYIDX]], align 16
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @loop_sve_f128
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load fp128, fp128*
+; CHECK-NEXT: %[[LOAD2:.*]] = load fp128, fp128*
+; CHECK-NEXT: %[[FSUB1:.*]] = fsub fp128 %[[LOAD1]], 0xL00000000000000008000000000000000
+; CHECK-NEXT: %[[FSUB2:.*]] = fsub fp128 %[[LOAD2]], 0xL00000000000000008000000000000000
+; CHECK-NEXT: store fp128 %[[FSUB1]], fp128* {{.*}}
+; CHECK-NEXT: store fp128 %[[FSUB2]], fp128* {{.*}}
entry:
br label %for.body
; CHECK-REMARKS: Scalable vectorization is not supported for all element types found in this loop
define dso_local void @loop_invariant_sve_i128(i128* nocapture %ptr, i128 %val, i64 %N) {
-; CHECK-LABEL: @loop_invariant_sve_i128(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i128, i128* [[PTR:%.*]], i64 [[INDUCTION]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i128, i128* [[PTR]], i64 [[INDUCTION1]]
-; CHECK-NEXT: store i128 [[VAL:%.*]], i128* [[TMP0]], align 16
-; CHECK-NEXT: store i128 [[VAL]], i128* [[TMP1]], align 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i128, i128* [[PTR]], i64 [[IV]]
-; CHECK-NEXT: store i128 [[VAL]], i128* [[ARRAYIDX]], align 16
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @loop_invariant_sve_i128
+; CHECK: vector.body:
+; CHECK: %[[GEP1:.*]] = getelementptr inbounds i128, i128* %ptr
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i128, i128* %ptr
+; CHECK-NEXT: store i128 %val, i128* %[[GEP1]]
+; CHECK-NEXT: store i128 %val, i128* %[[GEP2]]
entry:
br label %for.body
; CHECK-REMARKS: Scalable vectorization is not supported for all element types found in this loop
define void @uniform_store_i1(i1* noalias %dst, i64* noalias %start, i64 %N) {
-; CHECK-LABEL: @uniform_store_i1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i64, i64* [[START:%.*]], i64 [[N_VEC]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64*> poison, i64* [[START]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64*> [[BROADCAST_SPLATINSERT]], <2 x i64*> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64*> poison, i64* [[START]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64*> [[BROADCAST_SPLATINSERT3]], <2 x i64*> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i64* [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <2 x i64> <i64 0, i64 1>
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <2 x i64> <i64 2, i64 3>
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64*> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, i64* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, i64* [[TMP3]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[TMP6]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, <2 x i64*> [[TMP1]], i64 1
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, <2 x i64*> [[TMP2]], i64 1
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <2 x i64*> [[TMP8]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <2 x i64*> [[TMP9]], [[BROADCAST_SPLAT4]]
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
-; CHECK-NEXT: store i1 [[TMP12]], i1* [[DST:%.*]], align 1
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
-; CHECK-NEXT: store i1 [[TMP13]], i1* [[DST]], align 1
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0
-; CHECK-NEXT: store i1 [[TMP14]], i1* [[DST]], align 1
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1
-; CHECK-NEXT: store i1 [[TMP15]], i1* [[DST]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[PTR_IND]] = getelementptr i64, i64* [[POINTER_PHI]], i64 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[FIRST_SROA:%.*]] = phi i64* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[FIRST_SROA]], align 4
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i64, i64* [[FIRST_SROA]], i64 1
-; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64* [[INCDEC_PTR]], [[START]]
-; CHECK-NEXT: store i1 [[CMP_NOT]], i1* [[DST]], align 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @uniform_store_i1
+; CHECK: vector.body:
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <2 x i64*> {{.*}}, i64 1
+; CHECK: %[[ICMP:.*]] = icmp eq <2 x i64*> %[[GEP]], %[[SPLAT:.*]]
+; CHECK: %[[EXTRACT1:.*]] = extractelement <2 x i1> %[[ICMP]], i32 0
+; CHECK: store i1 %[[EXTRACT1]], i1* %dst
+; CHECK: %[[EXTRACT2:.*]] = extractelement <2 x i1> %[[ICMP]], i32 1
+; CHECK: store i1 %[[EXTRACT2]], i1* %dst
+; CHECK-NOT: vscale
entry:
br label %for.body
}
define dso_local void @loop_fixed_width_i128(i128* nocapture %ptr, i64 %N) {
-; CHECK-LABEL: @loop_fixed_width_i128(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i128, i128* [[PTR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i128, i128* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i128* [[TMP2]] to <4 x i128>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i128>, <4 x i128>* [[TMP3]], align 16
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i128> [[WIDE_LOAD]], <i128 42, i128 42, i128 42, i128 42>
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i128* [[TMP2]] to <4 x i128>*
-; CHECK-NEXT: store <4 x i128> [[TMP4]], <4 x i128>* [[TMP5]], align 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i128, i128* [[PTR]], i64 [[IV]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i128, i128* [[ARRAYIDX]], align 16
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i128 [[TMP7]], 42
-; CHECK-NEXT: store i128 [[ADD]], i128* [[ARRAYIDX]], align 16
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @loop_fixed_width_i128
+; CHECK: load <4 x i128>, <4 x i128>*
+; CHECK: add nsw <4 x i128> {{.*}}, <i128 42, i128 42, i128 42, i128 42>
+; CHECK: store <4 x i128> {{.*}}, <4 x i128>*
+; CHECK-NOT: vscale
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt -loop-vectorize -S < %s -debug 2>%t | FileCheck %s
; RUN: cat %t | FileCheck %s --check-prefix=DEBUG
define void @induction_i7(i64* %dst) #0 {
; CHECK-LABEL: @induction_i7(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i7
-; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
-; CHECK-NEXT: [[TMP5:%.*]] = trunc <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i7>
+; CHECK: [[TMP4:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
+; CHECK: [[TMP5:%.*]] = trunc <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i7>
; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i7> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i7> [[TMP6]], shufflevector (<vscale x 2 x i7> insertelement (<vscale x 2 x i7> poison, i7 1, i32 0), <vscale x 2 x i7> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i7> zeroinitializer, [[TMP7]]
-; CHECK-NEXT: [[TMP8:%.*]] = call i7 @llvm.vscale.i7()
-; CHECK-NEXT: [[TMP9:%.*]] = mul i7 [[TMP8]], 2
-; CHECK-NEXT: [[TMP10:%.*]] = mul i7 1, [[TMP9]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i7> poison, i7 [[TMP10]], i32 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i7> [[DOTSPLATINSERT]], <vscale x 2 x i7> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i7> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i7> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = zext <vscale x 2 x i7> [[TMP12]] to <vscale x 2 x i64>
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, i64* [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i64* [[TMP15]] to <vscale x 2 x i64>*
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP14]], <vscale x 2 x i64>* [[TMP16]], align 8
-; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i7> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i7 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV1294:%.*]] = phi i7 [ [[INDVARS_IV_NEXT1295:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INDVARS_IV1286:%.*]] = phi i64 [ [[INDVARS_IV_NEXT1287:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ADDI7:%.*]] = add i7 [[INDVARS_IV1294]], 0
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[DST]], i64 [[INDVARS_IV1286]]
-; CHECK-NEXT: [[EXT:%.*]] = zext i7 [[ADDI7]] to i64
-; CHECK-NEXT: store i64 [[EXT]], i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[INDVARS_IV_NEXT1287]] = add nuw nsw i64 [[INDVARS_IV1286]], 1
-; CHECK-NEXT: [[INDVARS_IV_NEXT1295]] = add i7 [[INDVARS_IV1294]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1287]], 64
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i7> [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 2 x i7> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP10]]
+; CHECK-NEXT: [[EXT:%.+]] = zext <vscale x 2 x i7> [[TMP11]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP13]] to <vscale x 2 x i64>*
+; CHECK-NEXT: store <vscale x 2 x i64> [[EXT]], <vscale x 2 x i64>* [[TMP14]], align 8
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i7> [[VEC_IND]],
;
entry:
br label %for.body
define void @induction_i3_zext(i64* %dst) #0 {
; CHECK-LABEL: @induction_i3_zext(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC]] to i3
-; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
-; CHECK-NEXT: [[TMP5:%.*]] = trunc <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i3>
+; CHECK: [[TMP4:%.*]] = call <vscale x 2 x i8> @llvm.experimental.stepvector.nxv2i8()
+; CHECK: [[TMP5:%.*]] = trunc <vscale x 2 x i8> [[TMP4]] to <vscale x 2 x i3>
; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i3> [[TMP5]], zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = mul <vscale x 2 x i3> [[TMP6]], shufflevector (<vscale x 2 x i3> insertelement (<vscale x 2 x i3> poison, i3 1, i32 0), <vscale x 2 x i3> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i3> zeroinitializer, [[TMP7]]
-; CHECK-NEXT: [[TMP8:%.*]] = call i3 @llvm.vscale.i3()
-; CHECK-NEXT: [[TMP9:%.*]] = mul i3 [[TMP8]], 2
-; CHECK-NEXT: [[TMP10:%.*]] = mul i3 1, [[TMP9]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i3> poison, i3 [[TMP10]], i32 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i3> [[DOTSPLATINSERT]], <vscale x 2 x i3> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i3> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = zext <vscale x 2 x i3> [[VEC_IND]] to <vscale x 2 x i64>
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, i64* [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64* [[TMP14]] to <vscale x 2 x i64>*
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64>* [[TMP15]], align 8
-; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i3> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i3 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV1294:%.*]] = phi i3 [ [[INDVARS_IV_NEXT1295:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INDVARS_IV1286:%.*]] = phi i64 [ [[INDVARS_IV_NEXT1287:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ZEXTI3:%.*]] = zext i3 [[INDVARS_IV1294]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[DST]], i64 [[INDVARS_IV1286]]
-; CHECK-NEXT: store i64 [[ZEXTI3]], i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[INDVARS_IV_NEXT1287]] = add nuw nsw i64 [[INDVARS_IV1286]], 1
-; CHECK-NEXT: [[INDVARS_IV_NEXT1295]] = add i3 [[INDVARS_IV1294]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT1287]], 64
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i3> [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 2 x i3> [[VEC_IND]] to <vscale x 2 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i64* [[TMP13]] to <vscale x 2 x i64>*
+; CHECK-NEXT: store <vscale x 2 x i64> [[TMP10]], <vscale x 2 x i64>* [[TMP14]], align 8
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i3> [[VEC_IND]],
;
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -mattr=+sve -mtriple aarch64-linux-gnu < %s | FileCheck %s
define void @invariant_load(i64 %n, i32* noalias nocapture %a, i32* nocapture readonly %b) {
-; CHECK-LABEL: @invariant_load(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 42
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP6]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = add nsw <vscale x 4 x i32> [[BROADCAST_SPLAT]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 4 x i32>*
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP10]], <vscale x 4 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 42
-; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]]
-; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP18]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @invariant_load
+; CHECK: vector.body:
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i32, i32* %b, i64 42
+; CHECK-NEXT: %[[INVLOAD:.*]] = load i32, i32* %[[GEP]]
+; CHECK-NEXT: %[[SPLATINS:.*]] = insertelement <vscale x 4 x i32> poison, i32 %[[INVLOAD]], i32 0
+; CHECK-NEXT: %[[SPLAT:.*]] = shufflevector <vscale x 4 x i32> %[[SPLATINS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK: %[[LOAD:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>*
+; CHECK-NEXT: %[[ADD:.*]] = add nsw <vscale x 4 x i32> %[[SPLAT]], %[[LOAD]]
+; CHECK: store <vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32>*
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -S < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
define void @inv_store_i16(i16* noalias %dst, i16* noalias readonly %src, i64 %N) #0 {
; CHECK-LABEL: @inv_store_i16(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16*> poison, i16* [[DST:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: %[[TMP1:.*]] = insertelement <vscale x 4 x i16*> poison, i16* %dst, i32 0
+; CHECK-NEXT: %[[SPLAT_PTRS:.*]] = shufflevector <vscale x 4 x i16*> %[[TMP1]], <vscale x 4 x i16*> poison, <vscale x 4 x i32> zeroinitializer
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[SRC:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <vscale x 4 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP7]], align 2
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> [[WIDE_LOAD]], <vscale x 4 x i16*> [[BROADCAST_SPLAT]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_INC24:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY14:%.*]]
-; CHECK: for.body14:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY14]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[SRC]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[LD:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; CHECK-NEXT: store i16 [[LD]], i16* [[DST]], align 2
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_INC24]], label [[FOR_BODY14]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: for.inc24:
-; CHECK-NEXT: ret void
-;
+; CHECK: %[[VECLOAD:.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* %{{.*}}, align 2
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> %[[VECLOAD]], <vscale x 4 x i16*> %[[SPLAT_PTRS]], i32 2
entry:
br label %for.body14
define void @cond_inv_store_i32(i32* noalias %dst, i32* noalias readonly %src, i64 %N) #0 {
; CHECK-LABEL: @cond_inv_store_i32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[DST:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: %[[TMP1:.*]] = insertelement <vscale x 4 x i32*> poison, i32* %dst, i32 0
+; CHECK-NEXT: %[[SPLAT_PTRS:.*]] = shufflevector <vscale x 4 x i32*> %[[TMP1]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_LOAD]], <vscale x 4 x i32*> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP8]])
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_09:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[I_09]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP12]], 0
-; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK: if.then:
-; CHECK-NEXT: store i32 [[TMP12]], i32* [[DST]], align 4
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_09]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: %[[VECLOAD:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* %{{.*}}, align 4
+; CHECK-NEXT: %[[MASK:.*]] = icmp sgt <vscale x 4 x i32> %[[VECLOAD]], zeroinitializer
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> %[[VECLOAD]], <vscale x 4 x i32*> %[[SPLAT_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]])
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -mattr=+sve -S %s -o - | FileCheck %s
define void @mloadstore_f32(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @mloadstore_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt <vscale x 4 x float> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr float, float* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x float> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32(<vscale x 4 x float> [[TMP9]], <vscale x 4 x float>* [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP6]])
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[I_011]]
-; CHECK-NEXT: [[TMP14:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP14]], 0.000000e+00
-; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[I_011]]
-; CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP14]], [[TMP15]]
-; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @mloadstore_f32
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x float>, <vscale x 4 x float>*
+; CHECK-NEXT: %[[MASK:.*]] = fcmp ogt <vscale x 4 x float> %[[LOAD1]],
+; CHECK-NEXT: %[[GEPA:.*]] = getelementptr float, float* %a,
+; CHECK-NEXT: %[[MLOAD_PTRS:.*]] = bitcast float* %[[GEPA]] to <vscale x 4 x float>*
+; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* %[[MLOAD_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]]
+; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 4 x float> %[[LOAD1]], %[[LOAD2]]
+; CHECK-NEXT: %[[MSTORE_PTRS:.*]] = bitcast float* %[[GEPA]] to <vscale x 4 x float>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32(<vscale x 4 x float> %[[FADD]], <vscale x 4 x float>* %[[MSTORE_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]])
entry:
br label %for.body
}
define void @mloadstore_i32(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @mloadstore_i32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <vscale x 4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[TMP9]], <vscale x 4 x i32>* [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP6]])
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_011:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_011]]
-; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1_NOT:%.*]] = icmp eq i32 [[TMP14]], 0
-; CHECK-NEXT: br i1 [[CMP1_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_011]]
-; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP14]], [[TMP15]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @mloadstore_i32
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>*
+; CHECK-NEXT: %[[MASK:.*]] = icmp ne <vscale x 4 x i32> %[[LOAD1]],
+; CHECK-NEXT: %[[GEPA:.*]] = getelementptr i32, i32* %a,
+; CHECK-NEXT: %[[MLOAD_PTRS:.*]] = bitcast i32* %[[GEPA]] to <vscale x 4 x i32>*
+; CHECK-NEXT: %[[LOAD2:.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* %[[MLOAD_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]]
+; CHECK-NEXT: %[[ADD:.*]] = add <vscale x 4 x i32> %[[LOAD1]], %[[LOAD2]]
+; CHECK-NEXT: %[[MSTORE_PTRS:.*]] = bitcast i32* %[[GEPA]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> %[[ADD]], <vscale x 4 x i32>* %[[MSTORE_PTRS]], i32 4, <vscale x 4 x i1> %[[MASK]])
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC1
; RUN: opt -loop-vectorize -force-vector-interleave=4 -force-vector-width=4 -S < %s | FileCheck %s --check-prefix=CHECK-VF4IC4
target triple = "aarch64-linux-gnu"
define i32 @select_const_i32_from_icmp(i32* nocapture readonly %v, i64 %n) #0 {
-; CHECK-VF4IC1-LABEL: @select_const_i32_from_icmp(
-; CHECK-VF4IC1-NEXT: entry:
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC1: vector.ph:
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
-; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT: [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
-; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 7, i32 3
-; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC1: scalar.ph:
-; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 3, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC1: for.body:
-; CHECK-VF4IC1-NEXT: [[TMP14:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[TMP15:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP14]]
-; CHECK-VF4IC1-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
-; CHECK-VF4IC1-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP17]], 3
-; CHECK-VF4IC1-NEXT: [[TMP19]] = select i1 [[TMP18]], i32 [[TMP15]], i32 7
-; CHECK-VF4IC1-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP14]], 1
-; CHECK-VF4IC1-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[N]]
-; CHECK-VF4IC1-NEXT: br i1 [[TMP21]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-VF4IC1: exit:
-; CHECK-VF4IC1-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP19]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT: ret i32 [[DOTLCSSA]]
-;
-; CHECK-VF4IC4-LABEL: @select_const_i32_from_icmp(
-; CHECK-VF4IC4-NEXT: entry:
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC4: vector.ph:
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC4: vector.body:
-; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP44:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 8
-; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 12
-; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP9]]
-; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP14]]
-; CHECK-VF4IC4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP19]]
-; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP25]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; CHECK-VF4IC4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP27]]
-; CHECK-VF4IC4-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP29]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 8
-; CHECK-VF4IC4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP31]]
-; CHECK-VF4IC4-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP33]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP35:%.*]] = mul i32 [[TMP34]], 12
-; CHECK-VF4IC4-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP35]]
-; CHECK-VF4IC4-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP37]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP38:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP39:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD4]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP40:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP41:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD6]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP42]] = select <vscale x 4 x i1> [[TMP38]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP43]] = select <vscale x 4 x i1> [[TMP39]], <vscale x 4 x i32> [[VEC_PHI1]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP44]] = select <vscale x 4 x i1> [[TMP40]], <vscale x 4 x i32> [[VEC_PHI2]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP45]] = select <vscale x 4 x i1> [[TMP41]], <vscale x 4 x i32> [[VEC_PHI3]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 16
-; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP47]]
-; CHECK-VF4IC4-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-VF4IC4: middle.block:
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP42]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP]], <vscale x 4 x i32> [[TMP42]], <vscale x 4 x i32> [[TMP43]]
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP7:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT8:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP7]], <vscale x 4 x i32> [[RDX_SELECT]], <vscale x 4 x i32> [[TMP44]]
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP9:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT8]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT10:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP9]], <vscale x 4 x i32> [[RDX_SELECT8]], <vscale x 4 x i32> [[TMP45]]
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP11:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP49:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP11]])
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT12:%.*]] = select i1 [[TMP49]], i32 7, i32 3
-; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC4: scalar.ph:
-; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 3, [[ENTRY]] ], [ [[RDX_SELECT12]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC4: for.body:
-; CHECK-VF4IC4-NEXT: [[TMP50:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP56:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[TMP51:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP55:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP50]]
-; CHECK-VF4IC4-NEXT: [[TMP53:%.*]] = load i32, i32* [[TMP52]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP54:%.*]] = icmp eq i32 [[TMP53]], 3
-; CHECK-VF4IC4-NEXT: [[TMP55]] = select i1 [[TMP54]], i32 [[TMP51]], i32 7
-; CHECK-VF4IC4-NEXT: [[TMP56]] = add nuw nsw i64 [[TMP50]], 1
-; CHECK-VF4IC4-NEXT: [[TMP57:%.*]] = icmp eq i64 [[TMP56]], [[N]]
-; CHECK-VF4IC4-NEXT: br i1 [[TMP57]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-VF4IC4: exit:
-; CHECK-VF4IC4-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP55]], [[FOR_BODY]] ], [ [[RDX_SELECT12]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT: ret i32 [[DOTLCSSA]]
-;
-
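+; With VF=4, IC=1 the reduction phi starts at a splat of the start value 3;
+; each iteration keeps the phi lanes where the load equals 3 and flips the
+; rest to 7, and middle.block selects 7 over 3 if any lane ever changed.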
+; CHECK-VF4IC1-LABEL: @select_const_i32_from_icmp
+; CHECK-VF4IC1: vector.body:
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
+; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1: middle.block:
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
+
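+; With IC=4 the same pattern is repeated across four reduction phis, which
+; middle.block folds back together pairwise before the final or-reduction.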
+; CHECK-VF4IC4-LABEL: @select_const_i32_from_icmp
+; CHECK-VF4IC4: vector.body:
+; CHECK-VF4IC4: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL1:%.*]], %vector.body ]
+; CHECK-VF4IC4: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL2:%.*]], %vector.body ]
+; CHECK-VF4IC4: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL3:%.*]], %vector.body ]
+; CHECK-VF4IC4: [[VEC_PHI4:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL4:%.*]], %vector.body ]
+; CHECK-VF4IC4: [[VEC_ICMP1:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP2:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP3:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP4:%.*]] = icmp eq <vscale x 4 x i32> {{.*}}, shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT: [[VEC_SEL1]] = select <vscale x 4 x i1> [[VEC_ICMP1]], <vscale x 4 x i32> [[VEC_PHI1]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT: [[VEC_SEL2]] = select <vscale x 4 x i1> [[VEC_ICMP2]], <vscale x 4 x i32> [[VEC_PHI2]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT: [[VEC_SEL3]] = select <vscale x 4 x i1> [[VEC_ICMP3]], <vscale x 4 x i32> [[VEC_PHI3]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT: [[VEC_SEL4]] = select <vscale x 4 x i1> [[VEC_ICMP4]], <vscale x 4 x i32> [[VEC_PHI4]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 7, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4: middle.block:
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP5:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL1]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT: [[VEC_SEL5:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP5]], <vscale x 4 x i32> [[VEC_SEL1]], <vscale x 4 x i32> [[VEC_SEL2]]
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP6:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT: [[VEC_SEL6:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP6]], <vscale x 4 x i32> [[VEC_SEL5]], <vscale x 4 x i32> [[VEC_SEL3]]
+; CHECK-VF4IC4-NEXT: [[VEC_ICMP7:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL6]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT: [[VEC_SEL7:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP7]], <vscale x 4 x i32> [[VEC_SEL6]], <vscale x 4 x i32> [[VEC_SEL4]]
+; CHECK-VF4IC4-NEXT: [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC4-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC4-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 7, i32 3
entry:
br label %for.body
}
define i32 @select_i32_from_icmp(i32* nocapture readonly %v, i32 %a, i32 %b, i64 %n) #0 {
-; CHECK-VF4IC1-LABEL: @select_i32_from_icmp(
-; CHECK-VF4IC1-NEXT: entry:
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC1: vector.ph:
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i32 0
-; CHECK-VF4IC1-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i32 0
-; CHECK-VF4IC1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
-; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT: [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[BROADCAST_SPLAT]]
-; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i32 0
-; CHECK-VF4IC1-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], [[DOTSPLAT]]
-; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
-; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 [[B]], i32 [[A]]
-; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC1: scalar.ph:
-; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC1: for.body:
-; CHECK-VF4IC1-NEXT: [[TMP14:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[TMP15:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP14]]
-; CHECK-VF4IC1-NEXT: [[TMP17:%.*]] = load i32, i32* [[TMP16]], align 4
-; CHECK-VF4IC1-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP17]], 3
-; CHECK-VF4IC1-NEXT: [[TMP19]] = select i1 [[TMP18]], i32 [[TMP15]], i32 [[B]]
-; CHECK-VF4IC1-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP14]], 1
-; CHECK-VF4IC1-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[N]]
-; CHECK-VF4IC1-NEXT: br i1 [[TMP21]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK-VF4IC1: exit:
-; CHECK-VF4IC1-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP19]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT: ret i32 [[DOTLCSSA]]
-;
-; CHECK-VF4IC4-LABEL: @select_i32_from_icmp(
-; CHECK-VF4IC4-NEXT: entry:
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC4: vector.ph:
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A:%.*]], i32 0
-; CHECK-VF4IC4-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[MINMAX_IDENT_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B:%.*]], i32 0
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B]], i32 0
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT7]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B]], i32 0
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT9]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLATINSERT11:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[B]], i32 0
-; CHECK-VF4IC4-NEXT: [[BROADCAST_SPLAT12:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT11]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC4: vector.body:
-; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP44:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ [[MINMAX_IDENT_SPLAT]], [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 8
-; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 12
-; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP9]]
-; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP14]]
-; CHECK-VF4IC4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP19]]
-; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP25]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; CHECK-VF4IC4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP27]]
-; CHECK-VF4IC4-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP29]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 8
-; CHECK-VF4IC4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP31]]
-; CHECK-VF4IC4-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP33]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP35:%.*]] = mul i32 [[TMP34]], 12
-; CHECK-VF4IC4-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP35]]
-; CHECK-VF4IC4-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP37]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP38:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP39:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD4]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP40:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP41:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_LOAD6]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP42]] = select <vscale x 4 x i1> [[TMP38]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[BROADCAST_SPLAT]]
-; CHECK-VF4IC4-NEXT: [[TMP43]] = select <vscale x 4 x i1> [[TMP39]], <vscale x 4 x i32> [[VEC_PHI1]], <vscale x 4 x i32> [[BROADCAST_SPLAT8]]
-; CHECK-VF4IC4-NEXT: [[TMP44]] = select <vscale x 4 x i1> [[TMP40]], <vscale x 4 x i32> [[VEC_PHI2]], <vscale x 4 x i32> [[BROADCAST_SPLAT10]]
-; CHECK-VF4IC4-NEXT: [[TMP45]] = select <vscale x 4 x i1> [[TMP41]], <vscale x 4 x i32> [[VEC_PHI3]], <vscale x 4 x i32> [[BROADCAST_SPLAT12]]
-; CHECK-VF4IC4-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 16
-; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP47]]
-; CHECK-VF4IC4-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK-VF4IC4: middle.block:
-; CHECK-VF4IC4-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i32 0
-; CHECK-VF4IC4-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP42]], [[DOTSPLAT]]
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP]], <vscale x 4 x i32> [[TMP42]], <vscale x 4 x i32> [[TMP43]]
-; CHECK-VF4IC4-NEXT: [[DOTSPLATINSERT13:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i32 0
-; CHECK-VF4IC4-NEXT: [[DOTSPLAT14:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT13]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP15:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT]], [[DOTSPLAT14]]
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT16:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP15]], <vscale x 4 x i32> [[RDX_SELECT]], <vscale x 4 x i32> [[TMP44]]
-; CHECK-VF4IC4-NEXT: [[DOTSPLATINSERT17:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i32 0
-; CHECK-VF4IC4-NEXT: [[DOTSPLAT18:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT17]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP19:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT16]], [[DOTSPLAT18]]
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT20:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP19]], <vscale x 4 x i32> [[RDX_SELECT16]], <vscale x 4 x i32> [[TMP45]]
-; CHECK-VF4IC4-NEXT: [[DOTSPLATINSERT21:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[A]], i32 0
-; CHECK-VF4IC4-NEXT: [[DOTSPLAT22:%.*]] = shufflevector <vscale x 4 x i32> [[DOTSPLATINSERT21]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP23:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT20]], [[DOTSPLAT22]]
-; CHECK-VF4IC4-NEXT: [[TMP49:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP23]])
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT24:%.*]] = select i1 [[TMP49]], i32 [[B]], i32 [[A]]
-; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC4: scalar.ph:
-; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[A]], [[ENTRY]] ], [ [[RDX_SELECT24]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC4: for.body:
-; CHECK-VF4IC4-NEXT: [[TMP50:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP56:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[TMP51:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP55:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[TMP52:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP50]]
-; CHECK-VF4IC4-NEXT: [[TMP53:%.*]] = load i32, i32* [[TMP52]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP54:%.*]] = icmp eq i32 [[TMP53]], 3
-; CHECK-VF4IC4-NEXT: [[TMP55]] = select i1 [[TMP54]], i32 [[TMP51]], i32 [[B]]
-; CHECK-VF4IC4-NEXT: [[TMP56]] = add nuw nsw i64 [[TMP50]], 1
-; CHECK-VF4IC4-NEXT: [[TMP57:%.*]] = icmp eq i64 [[TMP56]], [[N]]
-; CHECK-VF4IC4-NEXT: br i1 [[TMP57]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK-VF4IC4: exit:
-; CHECK-VF4IC4-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP55]], [[FOR_BODY]] ], [ [[RDX_SELECT24]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT: ret i32 [[DOTLCSSA]]
-;
-
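+; The selected values are now the arguments %a (start) and %b, so the splats
+; are materialised in vector.ph and middle.block compares against a fresh
+; splat of %a before choosing between %b and %a.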
+; CHECK-VF4IC1-LABEL: @select_i32_from_icmp
+; CHECK-VF4IC1: vector.ph:
+; CHECK-VF4IC1: [[TMP1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
+; CHECK-VF4IC1-NEXT: [[SPLAT_OF_A:%.*]] = shufflevector <vscale x 4 x i32> [[TMP1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+; CHECK-VF4IC1-NEXT: [[SPLAT_OF_B:%.*]] = shufflevector <vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4IC1: vector.body:
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ [[SPLAT_OF_A]], %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
+; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 3, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[SPLAT_OF_B]]
+; CHECK-VF4IC1: middle.block:
+; CHECK-VF4IC1-NEXT: [[FIN_INS:%.*]] = insertelement <vscale x 4 x i32> poison, i32 %a, i32 0
+; CHECK-VF4IC1-NEXT: [[FIN_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[FIN_INS]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-VF4IC1-NEXT: [[FIN_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], [[FIN_SPLAT]]
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_CMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 %b, i32 %a
+
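+; For IC=4 it is enough to check that a vectorized loop is still produced.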
+; CHECK-VF4IC4-LABEL: @select_i32_from_icmp
+; CHECK-VF4IC4: vector.body:
entry:
br label %for.body
}
define i32 @select_const_i32_from_fcmp(float* nocapture readonly %v, i64 %n) #0 {
-; CHECK-VF4IC1-LABEL: @select_const_i32_from_fcmp(
-; CHECK-VF4IC1-NEXT: entry:
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC1: vector.ph:
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[V:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
-; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <vscale x 4 x float>*
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP7]], align 4
-; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = fcmp fast ueq <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT: [[TMP9]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
-; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP13]], i32 1, i32 2
-; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC1: scalar.ph:
-; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC1: for.body:
-; CHECK-VF4IC1-NEXT: [[TMP14:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[TMP15:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[V]], i64 [[TMP14]]
-; CHECK-VF4IC1-NEXT: [[TMP17:%.*]] = load float, float* [[TMP16]], align 4
-; CHECK-VF4IC1-NEXT: [[TMP18:%.*]] = fcmp fast ueq float [[TMP17]], 3.000000e+00
-; CHECK-VF4IC1-NEXT: [[TMP19]] = select i1 [[TMP18]], i32 [[TMP15]], i32 1
-; CHECK-VF4IC1-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP14]], 1
-; CHECK-VF4IC1-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[N]]
-; CHECK-VF4IC1-NEXT: br i1 [[TMP21]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-VF4IC1: exit:
-; CHECK-VF4IC1-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP19]], [[FOR_BODY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT: ret i32 [[DOTLCSSA]]
-;
-; CHECK-VF4IC4-LABEL: @select_const_i32_from_fcmp(
-; CHECK-VF4IC4-NEXT: entry:
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC4: vector.ph:
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC4: vector.body:
-; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP44:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 8
-; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 12
-; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[V:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC4-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[V]], i64 [[TMP9]]
-; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[V]], i64 [[TMP14]]
-; CHECK-VF4IC4-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[V]], i64 [[TMP19]]
-; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 4 x float>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP25]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; CHECK-VF4IC4-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP27]]
-; CHECK-VF4IC4-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 4 x float>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP29]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 8
-; CHECK-VF4IC4-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP31]]
-; CHECK-VF4IC4-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <vscale x 4 x float>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP33]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP35:%.*]] = mul i32 [[TMP34]], 12
-; CHECK-VF4IC4-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP35]]
-; CHECK-VF4IC4-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <vscale x 4 x float>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP37]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP38:%.*]] = fcmp fast ueq <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP39:%.*]] = fcmp fast ueq <vscale x 4 x float> [[WIDE_LOAD4]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP40:%.*]] = fcmp fast ueq <vscale x 4 x float> [[WIDE_LOAD5]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP41:%.*]] = fcmp fast ueq <vscale x 4 x float> [[WIDE_LOAD6]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP42]] = select <vscale x 4 x i1> [[TMP38]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP43]] = select <vscale x 4 x i1> [[TMP39]], <vscale x 4 x i32> [[VEC_PHI1]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP44]] = select <vscale x 4 x i1> [[TMP40]], <vscale x 4 x i32> [[VEC_PHI2]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP45]] = select <vscale x 4 x i1> [[TMP41]], <vscale x 4 x i32> [[VEC_PHI3]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP47:%.*]] = mul i64 [[TMP46]], 16
-; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP47]]
-; CHECK-VF4IC4-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-VF4IC4: middle.block:
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[TMP42]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP]], <vscale x 4 x i32> [[TMP42]], <vscale x 4 x i32> [[TMP43]]
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP7:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT8:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP7]], <vscale x 4 x i32> [[RDX_SELECT]], <vscale x 4 x i32> [[TMP44]]
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP9:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT8]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT10:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP9]], <vscale x 4 x i32> [[RDX_SELECT8]], <vscale x 4 x i32> [[TMP45]]
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP11:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP49:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP11]])
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT12:%.*]] = select i1 [[TMP49]], i32 1, i32 2
-; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC4: scalar.ph:
-; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[RDX_SELECT12]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC4: for.body:
-; CHECK-VF4IC4-NEXT: [[TMP50:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP56:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[TMP51:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP55:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[V]], i64 [[TMP50]]
-; CHECK-VF4IC4-NEXT: [[TMP53:%.*]] = load float, float* [[TMP52]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP54:%.*]] = fcmp fast ueq float [[TMP53]], 3.000000e+00
-; CHECK-VF4IC4-NEXT: [[TMP55]] = select i1 [[TMP54]], i32 [[TMP51]], i32 1
-; CHECK-VF4IC4-NEXT: [[TMP56]] = add nuw nsw i64 [[TMP50]], 1
-; CHECK-VF4IC4-NEXT: [[TMP57:%.*]] = icmp eq i64 [[TMP56]], [[N]]
-; CHECK-VF4IC4-NEXT: br i1 [[TMP57]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-VF4IC4: exit:
-; CHECK-VF4IC4-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP55]], [[FOR_BODY]] ], [ [[RDX_SELECT12]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT: ret i32 [[DOTLCSSA]]
-;
-
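+; Same reduction, but guarded by a fast-math float compare (ueq against 3.0)
+; while still selecting between the integer constants 2 and 1.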
+; CHECK-VF4IC1-LABEL: @select_const_i32_from_fcmp
+; CHECK-VF4IC1: vector.body:
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <vscale x 4 x float>
+; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = fcmp fast ueq <vscale x 4 x float> [[VEC_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 3.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT: [[VEC_SEL]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> [[VEC_PHI]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1: middle.block:
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 2
+
+; CHECK-VF4IC4-LABEL: @select_const_i32_from_fcmp
+; CHECK-VF4IC4: vector.body:
entry:
br label %for.body
}
define float @select_const_f32_from_icmp(i32* nocapture readonly %v, i64 %n) #0 {
-; CHECK-VF4IC1-LABEL: @select_const_f32_from_icmp(
-; CHECK-VF4IC1-NEXT: entry:
-; CHECK-VF4IC1-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC1: for.body:
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = phi fast float [ 3.000000e+00, [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP0]]
-; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
-; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 3
-; CHECK-VF4IC1-NEXT: [[TMP5]] = select fast i1 [[TMP4]], float [[TMP1]], float 7.000000e+00
-; CHECK-VF4IC1-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
-; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[TMP6]], [[N:%.*]]
-; CHECK-VF4IC1-NEXT: br i1 [[TMP7]], label [[EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-VF4IC1: exit:
-; CHECK-VF4IC1-NEXT: [[DOTLCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: ret float [[DOTLCSSA]]
-;
-; CHECK-VF4IC4-LABEL: @select_const_f32_from_icmp(
-; CHECK-VF4IC4-NEXT: entry:
-; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC4: for.body:
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = phi fast float [ 3.000000e+00, [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP0]]
-; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP3]], 3
-; CHECK-VF4IC4-NEXT: [[TMP5]] = select fast i1 [[TMP4]], float [[TMP1]], float 7.000000e+00
-; CHECK-VF4IC4-NEXT: [[TMP6]] = add nuw nsw i64 [[TMP0]], 1
-; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = icmp eq i64 [[TMP6]], [[N:%.*]]
-; CHECK-VF4IC4-NEXT: br i1 [[TMP7]], label [[EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-VF4IC4: exit:
-; CHECK-VF4IC4-NEXT: [[DOTLCSSA:%.*]] = phi float [ [[TMP5]], [[FOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: ret float [[DOTLCSSA]]
-;
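+; Negative test: a select reduction on floating-point values is not
+; vectorized, so neither prefix should see a vector.body.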
+; CHECK-VF4IC1-LABEL: @select_const_f32_from_icmp
+; CHECK-VF4IC1-NOT: vector.body
+; CHECK-VF4IC4-LABEL: @select_const_f32_from_icmp
+; CHECK-VF4IC4-NOT: vector.body
entry:
br label %for.body
}
define i32 @pred_select_const_i32_from_icmp(i32* noalias nocapture readonly %src1, i32* noalias nocapture readonly %src2, i64 %n) #0 {
-; CHECK-VF4IC1-LABEL: @pred_select_const_i32_from_icmp(
-; CHECK-VF4IC1-NEXT: entry:
-; CHECK-VF4IC1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-VF4IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC1: vector.ph:
-; CHECK-VF4IC1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-VF4IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC1: vector.body:
-; CHECK-VF4IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC1-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-VF4IC1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <vscale x 4 x i32>*
-; CHECK-VF4IC1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP7]], align 4
-; CHECK-VF4IC1-NEXT: [[TMP8:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[SRC2:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC1-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
-; CHECK-VF4IC1-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <vscale x 4 x i32>*
-; CHECK-VF4IC1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> poison)
-; CHECK-VF4IC1-NEXT: [[TMP12:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-VF4IC1-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP8]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC1-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP8]], <vscale x 4 x i32> [[TMP13]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-VF4IC1-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC1-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
-; CHECK-VF4IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
-; CHECK-VF4IC1-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-VF4IC1: middle.block:
-; CHECK-VF4IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[PREDPHI]], zeroinitializer
-; CHECK-VF4IC1-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP]])
-; CHECK-VF4IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP18]], i32 1, i32 0
-; CHECK-VF4IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC1: scalar.ph:
-; CHECK-VF4IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC1: for.body:
-; CHECK-VF4IC1-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-VF4IC1-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[I_013]]
-; CHECK-VF4IC1-NEXT: [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-VF4IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP19]], 35
-; CHECK-VF4IC1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-VF4IC1: if.then:
-; CHECK-VF4IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[I_013]]
-; CHECK-VF4IC1-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-VF4IC1-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP20]], 2
-; CHECK-VF4IC1-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
-; CHECK-VF4IC1-NEXT: br label [[FOR_INC]]
-; CHECK-VF4IC1: for.inc:
-; CHECK-VF4IC1-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
-; CHECK-VF4IC1-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1
-; CHECK-VF4IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-VF4IC1: for.end.loopexit:
-; CHECK-VF4IC1-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC1-NEXT: ret i32 [[R_1_LCSSA]]
-;
-; CHECK-VF4IC4-LABEL: @pred_select_const_i32_from_icmp(
-; CHECK-VF4IC4-NEXT: entry:
-; CHECK-VF4IC4-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-VF4IC4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-VF4IC4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF4IC4: vector.ph:
-; CHECK-VF4IC4-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-VF4IC4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-VF4IC4-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF4IC4-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-VF4IC4: vector.body:
-; CHECK-VF4IC4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[VEC_PHI3:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-VF4IC4-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF4IC4-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-VF4IC4-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-VF4IC4-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-VF4IC4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-VF4IC4-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 8
-; CHECK-VF4IC4-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], 0
-; CHECK-VF4IC4-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 1
-; CHECK-VF4IC4-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], [[TMP13]]
-; CHECK-VF4IC4-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 12
-; CHECK-VF4IC4-NEXT: [[TMP17:%.*]] = add i64 [[TMP16]], 0
-; CHECK-VF4IC4-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 1
-; CHECK-VF4IC4-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], [[TMP18]]
-; CHECK-VF4IC4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC4-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[TMP9]]
-; CHECK-VF4IC4-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[TMP14]]
-; CHECK-VF4IC4-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[TMP19]]
-; CHECK-VF4IC4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-VF4IC4-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP25]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; CHECK-VF4IC4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP27]]
-; CHECK-VF4IC4-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP29]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 8
-; CHECK-VF4IC4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP31]]
-; CHECK-VF4IC4-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP33]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP34:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP35:%.*]] = mul i32 [[TMP34]], 12
-; CHECK-VF4IC4-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 [[TMP35]]
-; CHECK-VF4IC4-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x i32>, <vscale x 4 x i32>* [[TMP37]], align 4
-; CHECK-VF4IC4-NEXT: [[TMP38:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP39:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD4]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP40:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD5]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP41:%.*]] = icmp sgt <vscale x 4 x i32> [[WIDE_LOAD6]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP42:%.*]] = getelementptr i32, i32* [[SRC2:%.*]], i64 [[TMP4]]
-; CHECK-VF4IC4-NEXT: [[TMP43:%.*]] = getelementptr i32, i32* [[SRC2]], i64 [[TMP9]]
-; CHECK-VF4IC4-NEXT: [[TMP44:%.*]] = getelementptr i32, i32* [[SRC2]], i64 [[TMP14]]
-; CHECK-VF4IC4-NEXT: [[TMP45:%.*]] = getelementptr i32, i32* [[SRC2]], i64 [[TMP19]]
-; CHECK-VF4IC4-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP42]], i32 0
-; CHECK-VF4IC4-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP47]], i32 4, <vscale x 4 x i1> [[TMP38]], <vscale x 4 x i32> poison)
-; CHECK-VF4IC4-NEXT: [[TMP48:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP49:%.*]] = mul i32 [[TMP48]], 4
-; CHECK-VF4IC4-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[TMP42]], i32 [[TMP49]]
-; CHECK-VF4IC4-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP51]], i32 4, <vscale x 4 x i1> [[TMP39]], <vscale x 4 x i32> poison)
-; CHECK-VF4IC4-NEXT: [[TMP52:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP53:%.*]] = mul i32 [[TMP52]], 8
-; CHECK-VF4IC4-NEXT: [[TMP54:%.*]] = getelementptr i32, i32* [[TMP42]], i32 [[TMP53]]
-; CHECK-VF4IC4-NEXT: [[TMP55:%.*]] = bitcast i32* [[TMP54]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP55]], i32 4, <vscale x 4 x i1> [[TMP40]], <vscale x 4 x i32> poison)
-; CHECK-VF4IC4-NEXT: [[TMP56:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-VF4IC4-NEXT: [[TMP57:%.*]] = mul i32 [[TMP56]], 12
-; CHECK-VF4IC4-NEXT: [[TMP58:%.*]] = getelementptr i32, i32* [[TMP42]], i32 [[TMP57]]
-; CHECK-VF4IC4-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <vscale x 4 x i32>*
-; CHECK-VF4IC4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP59]], i32 4, <vscale x 4 x i1> [[TMP41]], <vscale x 4 x i32> poison)
-; CHECK-VF4IC4-NEXT: [[TMP60:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP61:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP62:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD8]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP63:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP64:%.*]] = select <vscale x 4 x i1> [[TMP60]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-VF4IC4-NEXT: [[TMP65:%.*]] = select <vscale x 4 x i1> [[TMP61]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI1]]
-; CHECK-VF4IC4-NEXT: [[TMP66:%.*]] = select <vscale x 4 x i1> [[TMP62]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI2]]
-; CHECK-VF4IC4-NEXT: [[TMP67:%.*]] = select <vscale x 4 x i1> [[TMP63]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI3]]
-; CHECK-VF4IC4-NEXT: [[TMP68:%.*]] = xor <vscale x 4 x i1> [[TMP38]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP69:%.*]] = xor <vscale x 4 x i1> [[TMP39]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP70:%.*]] = xor <vscale x 4 x i1> [[TMP40]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[TMP71:%.*]] = xor <vscale x 4 x i1> [[TMP41]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-VF4IC4-NEXT: [[PREDPHI]] = select <vscale x 4 x i1> [[TMP38]], <vscale x 4 x i32> [[TMP64]], <vscale x 4 x i32> [[VEC_PHI]]
-; CHECK-VF4IC4-NEXT: [[PREDPHI10]] = select <vscale x 4 x i1> [[TMP39]], <vscale x 4 x i32> [[TMP65]], <vscale x 4 x i32> [[VEC_PHI1]]
-; CHECK-VF4IC4-NEXT: [[PREDPHI11]] = select <vscale x 4 x i1> [[TMP40]], <vscale x 4 x i32> [[TMP66]], <vscale x 4 x i32> [[VEC_PHI2]]
-; CHECK-VF4IC4-NEXT: [[PREDPHI12]] = select <vscale x 4 x i1> [[TMP41]], <vscale x 4 x i32> [[TMP67]], <vscale x 4 x i32> [[VEC_PHI3]]
-; CHECK-VF4IC4-NEXT: [[TMP72:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-VF4IC4-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 16
-; CHECK-VF4IC4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP73]]
-; CHECK-VF4IC4-NEXT: [[TMP74:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-VF4IC4: middle.block:
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <vscale x 4 x i32> [[PREDPHI]], zeroinitializer
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP]], <vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32> [[PREDPHI10]]
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP13:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT]], zeroinitializer
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT14:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP13]], <vscale x 4 x i32> [[RDX_SELECT]], <vscale x 4 x i32> [[PREDPHI11]]
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP15:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT14]], zeroinitializer
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT16:%.*]] = select <vscale x 4 x i1> [[RDX_SELECT_CMP15]], <vscale x 4 x i32> [[RDX_SELECT14]], <vscale x 4 x i32> [[PREDPHI12]]
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT_CMP17:%.*]] = icmp ne <vscale x 4 x i32> [[RDX_SELECT16]], zeroinitializer
-; CHECK-VF4IC4-NEXT: [[TMP75:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[RDX_SELECT_CMP17]])
-; CHECK-VF4IC4-NEXT: [[RDX_SELECT18:%.*]] = select i1 [[TMP75]], i32 1, i32 0
-; CHECK-VF4IC4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF4IC4-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-VF4IC4: scalar.ph:
-; CHECK-VF4IC4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF4IC4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT18]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-VF4IC4: for.body:
-; CHECK-VF4IC4-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-VF4IC4-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-VF4IC4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[I_013]]
-; CHECK-VF4IC4-NEXT: [[TMP76:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-VF4IC4-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP76]], 35
-; CHECK-VF4IC4-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK-VF4IC4: if.then:
-; CHECK-VF4IC4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[I_013]]
-; CHECK-VF4IC4-NEXT: [[TMP77:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-VF4IC4-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP77]], 2
-; CHECK-VF4IC4-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
-; CHECK-VF4IC4-NEXT: br label [[FOR_INC]]
-; CHECK-VF4IC4: for.inc:
-; CHECK-VF4IC4-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
-; CHECK-VF4IC4-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1
-; CHECK-VF4IC4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF4IC4-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-VF4IC4: for.end.loopexit:
-; CHECK-VF4IC4-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT18]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF4IC4-NEXT: ret i32 [[R_1_LCSSA]]
-;
-
+; CHECK-VF4IC1-LABEL: @pred_select_const_i32_from_icmp
+; CHECK-VF4IC1: vector.body:
+; CHECK-VF4IC1: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ [[VEC_SEL:%.*]], %vector.body ]
+; CHECK-VF4IC1: [[VEC_LOAD:%.*]] = load <vscale x 4 x i32>
+; CHECK-VF4IC1: [[MASK:%.*]] = icmp sgt <vscale x 4 x i32> [[VEC_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 35, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1: [[MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* {{%.*}}, i32 4, <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> poison)
+; CHECK-VF4IC1-NEXT: [[VEC_ICMP:%.*]] = icmp eq <vscale x 4 x i32> [[MASKED_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 2, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-VF4IC1-NEXT: [[VEC_SEL_TMP:%.*]] = select <vscale x 4 x i1> [[VEC_ICMP]], <vscale x 4 x i32> shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i32 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1: [[VEC_SEL:%.*]] = select <vscale x 4 x i1> [[MASK]], <vscale x 4 x i32> [[VEC_SEL_TMP]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-VF4IC1: middle.block:
+; CHECK-VF4IC1-NEXT: [[FIN_ICMP:%.*]] = icmp ne <vscale x 4 x i32> [[VEC_SEL]], zeroinitializer
+; CHECK-VF4IC1-NEXT: [[OR_RDX:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1(<vscale x 4 x i1> [[FIN_ICMP]])
+; CHECK-VF4IC1-NEXT: {{.*}} = select i1 [[OR_RDX]], i32 1, i32 0
+
+; CHECK-VF4IC4-LABEL: @pred_select_const_i32_from_icmp
+; CHECK-VF4IC4: vector.body:
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize < %s | FileCheck %s
; These tests ensure that tail-folding is enabled when the predicate.enable
; loop attribute is set to true.
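;
; For reference, a minimal sketch of how a loop opts in to predicated
; (tail-folded) vectorization via that attribute, assuming the standard
; llvm.loop.vectorize.predicate.enable metadata spelling; the exact hint
; names used by the functions below may differ:
;
;   while.body:
;     ...
;     br i1 %cmp, label %while.body, label %while.end, !llvm.loop !0
;
;   !0 = distinct !{!0, !1}
;   !1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
;
; With the hint present, the vectorizer folds the scalar tail into the vector
; loop by masking each memory operation with an active lane mask, as the
; checks below expect.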
; CHECK-LABEL: @simple_memset(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT6]], <vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label %middle.block, label %vector.body
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: store i32 [[VAL]], i32* [[GEP]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
;
entry:
br label %while.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -hints-allow-reordering=false -loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -prefer-inloop-reductions < %s | FileCheck %s
; RUN: opt -S -hints-allow-reordering=false -loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -prefer-inloop-reductions < %s | FileCheck %s
; CHECK-LABEL: @simple_memset(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT6]], <vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label %middle.block, label %vector.body
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: store i32 [[VAL]], i32* [[GEP]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
;
entry:
br label %while.body
; CHECK-LABEL: @simple_memcpy(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: br label %vector.body
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <vscale x 4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32>* [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32>* [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP18]], label %middle.block, label %vector.body, !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, i32* [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: store i32 [[VAL]], i32* [[GEP2]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
;
entry:
br label %while.body
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 4
; CHECK-NEXT: [[TMP13:%.*]] = mul i64 4, [[TMP12]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP13]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: br label %vector.body
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i64> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
; CHECK-NEXT: [[TMP15:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP14]]
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT]], [[TMP15]]
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <vscale x 4 x i64> [[BROADCAST_SPLAT4]], [[TMP15]]
; CHECK-NEXT: [[TMP16:%.*]] = extractelement <vscale x 4 x i64> [[VEC_IV]], i32 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP16]], i64 [[TMP2]])
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]]
; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP20]]
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP21]], label %middle.block, label %vector.body
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, i32* [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: store i32 [[VAL]], i32* [[GEP2]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 4
-; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
;
entry:
br label %while.body
; CHECK-LABEL: @simple_gather_scatter(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[IND:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> undef)
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP10]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP12]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[IND:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[SRC:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> undef)
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[DST:%.*]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x i32*> [[TMP14]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label %middle.block, label %vector.body
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, i32* [[IND]], i64 [[INDEX]]
-; CHECK-NEXT: [[IND_VAL:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, i32* [[SRC]], i32 [[IND_VAL]]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[GEP2]], align 4
-; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i32, i32* [[DST]], i32 [[IND_VAL]]
-; CHECK-NEXT: store i32 [[VAL]], i32* [[GEP3]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
;
entry:
br label %while.body
define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
; CHECK-LABEL: @uniform_load(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: br label %vector.body
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[N]])
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[SRC:%.*]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP6]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <vscale x 4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x i32>* [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 %n)
+; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[TMP10]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[BROADCAST_SPLAT4]], <vscale x 4 x i32>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP16]], label %middle.block, label %vector.body, !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[VAL]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br i1 true, label %for.end, label %scalar.ph
;
entry:
define void @cond_uniform_load(i32* noalias %dst, i32* noalias readonly %src, i32* noalias readonly %cond, i64 %n) #0 {
; CHECK-LABEL: @cond_uniform_load(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[SRC:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[SRC:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT5]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[N]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = xor <vscale x 4 x i1> [[TMP9]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP11:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP10]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[TMP11]], <vscale x 4 x i32> undef)
-; CHECK-NEXT: [[TMP12:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP9]], <vscale x 4 x i1> zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP12]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP14:%.*]] = or <vscale x 4 x i1> [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <vscale x 4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32>* [[TMP16]], i32 4, <vscale x 4 x i1> [[TMP14]])
-; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP18]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 %n)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = xor <vscale x 4 x i1> [[TMP13]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP15:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP14]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0i32(<vscale x 4 x i32*> [[BROADCAST_SPLAT6]], i32 4, <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> undef)
+; CHECK-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP13]], <vscale x 4 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP18:%.*]] = or <vscale x 4 x i1> [[TMP15]], [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[PREDPHI]], <vscale x 4 x i32>* [[TMP20]], i32 4, <vscale x 4 x i1> [[TMP18]])
+; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP22]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP23]], label %middle.block, label %vector.body
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP20]], 0
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[TMP21]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: store i32 [[VAL_0]], i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br i1 true, label %for.end, label %scalar.ph
;
entry:
define void @uniform_store(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
; CHECK-LABEL: @uniform_store(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[DST:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 4 x i32*> poison, i32* [[DST:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <vscale x 4 x i32*> [[BROADCAST_SPLATINSERT3]], <vscale x 4 x i32*> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: br label %vector.body
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[N]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32*> [[BROADCAST_SPLAT]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 %n)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32*> [[BROADCAST_SPLAT4]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label %middle.block, label %vector.body
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: store i32 [[VAL]], i32* [[DST]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br i1 true, label %for.end, label %scalar.ph
;
entry:
; CHECK-LABEL: @simple_fdiv(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: br label %vector.body
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[SRC:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr float, float* [[DST:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP9]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-NEXT: [[TMP12:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP10]] to <vscale x 4 x float>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32(<vscale x 4 x float> [[TMP12]], <vscale x 4 x float>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 4
-; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT3]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[SRC:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[DST:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 4 x float>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP13]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr float, float* [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <vscale x 4 x float>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP15]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP16:%.*]] = fdiv <vscale x 4 x float> [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD5]]
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP14]] to <vscale x 4 x float>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0nxv4f32(<vscale x 4 x float> [[TMP16]], <vscale x 4 x float>* [[TMP17]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label %middle.block, label %vector.body, !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr float, float* [[SRC]], i64 [[INDEX]]
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr float, float* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: [[VAL1:%.*]] = load float, float* [[GEP1]], align 4
-; CHECK-NEXT: [[VAL2:%.*]] = load float, float* [[GEP2]], align 4
-; CHECK-NEXT: [[RES:%.*]] = fdiv float [[VAL1]], [[VAL2]]
-; CHECK-NEXT: store float [[RES]], float* [[GEP2]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
;
entry:
br label %while.body
; CHECK-LABEL: @add_reduction_i32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: br label %vector.body
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP9]])
-; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %vector.ph ], [ [[TMP14:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP11]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP12:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP12]])
+; CHECK-NEXT: [[TMP14]] = add i32 [[TMP13]], [[VEC_PHI]]
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label %middle.block, label %vector.body, !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[RED_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, i32* [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[GEP]], align 4
-; CHECK-NEXT: [[RED_NEXT]] = add i32 [[RED]], [[VAL]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[RED_NEXT_LCSSA]]
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
;
entry:
br label %while.body
; CHECK-LABEL: @add_reduction_f32(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1)
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: br label %vector.body
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX1]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[UMAX]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[PTR:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP8]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP10]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP9]])
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
-; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP12]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT2:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[TMP14:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP8]], i64 [[UMAX]])
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, float* [[PTR:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP10]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <vscale x 4 x float>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x float> @llvm.masked.load.nxv4f32.p0nxv4f32(<vscale x 4 x float>* [[TMP12]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> poison)
+; CHECK-NEXT: [[TMP13:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x float> [[WIDE_MASKED_LOAD]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float -0.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP14]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], <vscale x 4 x float> [[TMP13]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; CHECK-NEXT: [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label %middle.block, label %vector.body, !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[RED:%.*]] = phi float [ [[RED_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr float, float* [[PTR]], i64 [[INDEX]]
-; CHECK-NEXT: [[VAL:%.*]] = load float, float* [[GEP]], align 4
-; CHECK-NEXT: [[RED_NEXT]] = fadd float [[RED]], [[VAL]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi float [ [[RED_NEXT]], [[WHILE_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[RED_NEXT_LCSSA]]
+; CHECK-NEXT: br i1 true, label %while.end.loopexit, label %scalar.ph
;
entry:
br label %while.body
define i32 @cond_xor_reduction(i32* noalias %a, i32* noalias %cond, i64 %N) #0 {
; CHECK-LABEL: @cond_xor_reduction(
; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 false, label %scalar.ph, label %vector.ph
; CHECK: vector.ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], [[TMP4]]
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: br label %vector.body
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 7, %vector.ph ], [ [[TMP16:%.*]], %vector.body ]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[TMP5]], i64 [[N]])
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP19]], label %middle.block, label %vector.body, !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 7, [[ENTRY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]]
-; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP20]], 5
-; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[RDX]], [[TMP21]]
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[RES]] = phi i32 [ [[RDX]], [[FOR_BODY]] ], [ [[XOR]], [[IF_THEN]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[RES_LCSSA]]
+; CHECK-NEXT: br i1 true, label %for.end, label %scalar.ph
;
entry:
br label %for.body
; Integer divides can throw exceptions and since we can't scalarize conditional
; divides for scalable vectors we just don't bother vectorizing.
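; For illustration only (assumed from the IR below, not part of the checks), the
; scalar loop is roughly equivalent to this C sketch, where the udiv may trap
; when dst[i] == 0:
;   void simple_idiv(unsigned *dst, unsigned *src, long n) {
;     for (long i = 0; i < n; i++)
;       dst[i] = src[i] / dst[i];
;   }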
define void @simple_idiv(i32* noalias %dst, i32* noalias %src, i64 %n) #0 {
; CHECK-LABEL: @simple_idiv(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, i32* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[VAL1:%.*]] = load i32, i32* [[GEP1]], align 4
-; CHECK-NEXT: [[VAL2:%.*]] = load i32, i32* [[GEP2]], align 4
-; CHECK-NEXT: [[RES:%.*]] = udiv i32 [[VAL1]], [[VAL2]]
-; CHECK-NEXT: store i32 [[RES]], i32* [[GEP2]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1
-; CHECK-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT:%.*]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: ret void
+; CHECK-NOT: vector.body
;
entry:
br label %while.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -dce -instcombine < %s -S | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
define void @f16_to_f32(float* noalias nocapture %dst, half* noalias nocapture readonly %src, i64 %N) #0 {
; CHECK-LABEL: @f16_to_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds half, half* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast half* [[TMP4]] to <vscale x 8 x half>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, <vscale x 8 x half>* [[TMP5]], align 2
-; CHECK-NEXT: [[TMP6:%.*]] = fpext <vscale x 8 x half> [[WIDE_LOAD]] to <vscale x 8 x float>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 8 x float>*
-; CHECK-NEXT: store <vscale x 8 x float> [[TMP6]], <vscale x 8 x float>* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, half* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT: [[TMP12:%.*]] = load half, half* [[ARRAYIDX]], align 2
-; CHECK-NEXT: [[CONV:%.*]] = fpext half [[TMP12]] to float
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[DST]], i64 [[I_07]]
-; CHECK-NEXT: store float [[CONV]], float* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body
+; CHECK: %{{.*}} = fpext <vscale x 8 x half> %{{.*}} to <vscale x 8 x float>
entry:
br label %for.body
define void @f64_to_f32(float* noalias nocapture %dst, double* noalias nocapture readonly %src, i64 %N) #0 {
; CHECK-LABEL: @f64_to_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <vscale x 8 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x double>, <vscale x 8 x double>* [[TMP5]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = fptrunc <vscale x 8 x double> [[WIDE_LOAD]] to <vscale x 8 x float>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 8 x float>*
-; CHECK-NEXT: store <vscale x 8 x float> [[TMP6]], <vscale x 8 x float>* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[CONV:%.*]] = fptrunc double [[TMP12]] to float
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[DST]], i64 [[I_07]]
-; CHECK-NEXT: store float [[CONV]], float* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body
+; CHECK: %{{.*}} = fptrunc <vscale x 8 x double> %{{.*}} to <vscale x 8 x float>
entry:
br label %for.body
define void @f16_to_s8(i8* noalias nocapture %dst, half* noalias nocapture readonly %src, i64 %N) #0 {
; CHECK-LABEL: @f16_to_s8(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds half, half* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast half* [[TMP4]] to <vscale x 8 x half>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, <vscale x 8 x half>* [[TMP5]], align 2
-; CHECK-NEXT: [[TMP6:%.*]] = fptosi <vscale x 8 x half> [[WIDE_LOAD]] to <vscale x 8 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <vscale x 8 x i8>*
-; CHECK-NEXT: store <vscale x 8 x i8> [[TMP6]], <vscale x 8 x i8>* [[TMP8]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds half, half* [[SRC]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP12:%.*]] = load half, half* [[ARRAYIDX]], align 2
-; CHECK-NEXT: [[CONV1:%.*]] = fptosi half [[TMP12]] to i8
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[I_08]]
-; CHECK-NEXT: store i8 [[CONV1]], i8* [[ARRAYIDX2]], align 1
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body
+; CHECK: %{{.*}} = fptosi <vscale x 8 x half> %{{.*}} to <vscale x 8 x i8>
entry:
br label %for.body
define void @f32_to_u64(i64* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %N) #0 {
; CHECK-LABEL: @f32_to_u64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <vscale x 8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = fptoui <vscale x 8 x float> [[WIDE_LOAD]] to <vscale x 8 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[TMP7]] to <vscale x 8 x i64>*
-; CHECK-NEXT: store <vscale x 8 x i64> [[TMP6]], <vscale x 8 x i64>* [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CONV:%.*]] = fptoui float [[TMP12]] to i64
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[DST]], i64 [[I_07]]
-; CHECK-NEXT: store i64 [[CONV]], i64* [[ARRAYIDX1]], align 8
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body
+; CHECK: %{{.*}} = fptoui <vscale x 8 x float> %{{.*}} to <vscale x 8 x i64>
entry:
br label %for.body
define void @s8_to_f32(float* noalias nocapture %dst, i8* noalias nocapture readonly %src, i64 %N) #0 {
; CHECK-LABEL: @s8_to_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <vscale x 8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>* [[TMP5]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = sitofp <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x float>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 8 x float>*
-; CHECK-NEXT: store <vscale x 8 x float> [[TMP6]], <vscale x 8 x float>* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i8 [[TMP12]] to float
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[DST]], i64 [[I_07]]
-; CHECK-NEXT: store float [[CONV]], float* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body
+; CHECK: %{{.*}} = sitofp <vscale x 8 x i8> %{{.*}} to <vscale x 8 x float>
entry:
br label %for.body
define void @u16_to_f32(float* noalias nocapture %dst, i16* noalias nocapture readonly %src, i64 %N) #0 {
; CHECK-LABEL: @u16_to_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <vscale x 8 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, <vscale x 8 x i16>* [[TMP5]], align 2
-; CHECK-NEXT: [[TMP6:%.*]] = uitofp <vscale x 8 x i16> [[WIDE_LOAD]] to <vscale x 8 x float>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <vscale x 8 x float>*
-; CHECK-NEXT: store <vscale x 8 x float> [[TMP6]], <vscale x 8 x float>* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; CHECK-NEXT: [[CONV:%.*]] = uitofp i16 [[TMP12]] to float
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[DST]], i64 [[I_07]]
-; CHECK-NEXT: store float [[CONV]], float* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body
+; CHECK: %{{.*}} = uitofp <vscale x 8 x i16> %{{.*}} to <vscale x 8 x float>
entry:
br label %for.body
define void @u64_to_f16(half* noalias nocapture %dst, i64* noalias nocapture readonly %src, i64 %N) #0 {
; CHECK-LABEL: @u64_to_f16(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <vscale x 8 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, <vscale x 8 x i64>* [[TMP5]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = uitofp <vscale x 8 x i64> [[WIDE_LOAD]] to <vscale x 8 x half>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds half, half* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast half* [[TMP7]] to <vscale x 8 x half>*
-; CHECK-NEXT: store <vscale x 8 x half> [[TMP6]], <vscale x 8 x half>* [[TMP8]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[SRC]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[CONV1:%.*]] = uitofp i64 [[TMP12]] to half
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, half* [[DST]], i64 [[I_08]]
-; CHECK-NEXT: store half [[CONV1]], half* [[ARRAYIDX2]], align 2
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body
+; CHECK: %{{.*}} = uitofp <vscale x 8 x i64> %{{.*}} to <vscale x 8 x half>
entry:
br label %for.body
define void @s64_to_f16(half* noalias nocapture %dst, i64* noalias nocapture readonly %src, i64 %N) #0 {
; CHECK-LABEL: @s64_to_f16(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <vscale x 8 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, <vscale x 8 x i64>* [[TMP5]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = sitofp <vscale x 8 x i64> [[WIDE_LOAD]] to <vscale x 8 x half>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds half, half* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast half* [[TMP7]] to <vscale x 8 x half>*
-; CHECK-NEXT: store <vscale x 8 x half> [[TMP6]], <vscale x 8 x half>* [[TMP8]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[SRC]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[CONV1:%.*]] = sitofp i64 [[TMP12]] to half
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, half* [[DST]], i64 [[I_08]]
-; CHECK-NEXT: store half [[CONV1]], half* [[ARRAYIDX2]], align 2
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body
+; CHECK: %{{.*}} = sitofp <vscale x 8 x i64> %{{.*}} to <vscale x 8 x half>
entry:
br label %for.body
define void @s8_to_s32(i32* noalias nocapture %dst, i8* noalias nocapture readonly %src, i64 %N) #0 {
; CHECK-LABEL: @s8_to_s32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <vscale x 8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>* [[TMP5]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = sext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 8 x i32>*
-; CHECK-NEXT: store <vscale x 8 x i32> [[TMP6]], <vscale x 8 x i32>* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP12]] to i32
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[I_07]]
-; CHECK-NEXT: store i32 [[CONV]], i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body
+; CHECK: %{{.*}} = sext <vscale x 8 x i8> %{{.*}} to <vscale x 8 x i32>
entry:
br label %for.body
define void @u8_to_u16(i16* noalias nocapture %dst, i8* noalias nocapture readonly %src, i64 %N) #0 {
; CHECK-LABEL: @u8_to_u16(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <vscale x 8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>* [[TMP5]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <vscale x 8 x i16>*
-; CHECK-NEXT: store <vscale x 8 x i16> [[TMP6]], <vscale x 8 x i16>* [[TMP8]], align 2
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP12]] to i16
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, i16* [[DST]], i64 [[I_07]]
-; CHECK-NEXT: store i16 [[CONV]], i16* [[ARRAYIDX1]], align 2
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body
+; CHECK: %{{.*}} = zext <vscale x 8 x i8> %{{.*}} to <vscale x 8 x i16>
entry:
br label %for.body
define void @s64_to_s8(i8* noalias nocapture %dst, i64* noalias nocapture readonly %src, i64 %N) #0 {
; CHECK-LABEL: @s64_to_s8(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[SRC:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <vscale x 8 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, <vscale x 8 x i64>* [[TMP5]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = trunc <vscale x 8 x i64> [[WIDE_LOAD]] to <vscale x 8 x i8>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[DST:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <vscale x 8 x i8>*
-; CHECK-NEXT: store <vscale x 8 x i8> [[TMP6]], <vscale x 8 x i8>* [[TMP8]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_07:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[SRC]], i64 [[I_07]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[TMP12]] to i8
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[I_07]]
-; CHECK-NEXT: store i8 [[CONV]], i8* [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body
+; CHECK: %{{.*}} = trunc <vscale x 8 x i64> %{{.*}} to <vscale x 8 x i8>
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; This is the loop in C++ being vectorized in this file with
; experimental.vector.reverse
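; A minimal sketch of that C++ loop (assumed for illustration; the names mirror
; the IR arguments %a, %cond and %N):
;   #pragma clang loop vectorize_width(4, scalable)
;   for (long i = N - 1; i >= 0; i--)
;     if (cond[i] != 0.0)
;       a[i] += 1.0;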
target triple = "aarch64-unknown-linux-gnu"
define void @vector_reverse_mask_nxv4i1(double* %a, double* %cond, i64 %N) #0 {
; CHECK-LABEL: @vector_reverse_mask_nxv4i1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[N]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 [[N]]
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[COND:%.*]], i64 [[N]]
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[SCEVGEP4]], [[A]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt double* [[SCEVGEP]], [[COND]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = xor i64 [[INDEX]], -1
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], [[N]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[COND]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[DOTNEG:%.*]] = mul i32 [[TMP7]], -4
-; CHECK-NEXT: [[TMP8:%.*]] = or i32 [[DOTNEG]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP6]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[TMP10]] to <vscale x 4 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x double>, <vscale x 4 x double>* [[TMP11]], align 8, !alias.scope !0
-; CHECK-NEXT: [[REVERSE:%.*]] = call <vscale x 4 x double> @llvm.experimental.vector.reverse.nxv4f64(<vscale x 4 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP12:%.*]] = fcmp une <vscale x 4 x double> [[REVERSE]], zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[A]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[DOTNEG10:%.*]] = mul i32 [[TMP14]], -4
-; CHECK-NEXT: [[TMP15:%.*]] = or i32 [[DOTNEG10]], 1
-; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, double* [[TMP13]], i64 [[TMP16]]
-; CHECK-NEXT: [[REVERSE6:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP12]])
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP17]] to <vscale x 4 x double>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double>* [[TMP18]], i32 8, <vscale x 4 x i1> [[REVERSE6]], <vscale x 4 x double> poison), !alias.scope !3, !noalias !0
-; CHECK-NEXT: [[TMP19:%.*]] = fadd <vscale x 4 x double> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 4 x double> insertelement (<vscale x 4 x double> poison, double 1.000000e+00, i32 0), <vscale x 4 x double> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[DOTNEG11:%.*]] = mul i32 [[TMP20]], -4
-; CHECK-NEXT: [[TMP21:%.*]] = or i32 [[DOTNEG11]], 1
-; CHECK-NEXT: [[TMP22:%.*]] = sext i32 [[TMP21]] to i64
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, double* [[TMP13]], i64 [[TMP22]]
-; CHECK-NEXT: [[REVERSE9:%.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> [[TMP12]])
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast double* [[TMP23]] to <vscale x 4 x double>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4f64.p0nxv4f64(<vscale x 4 x double> [[TMP19]], <vscale x 4 x double>* [[TMP24]], i32 8, <vscale x 4 x i1> [[REVERSE9]]), !alias.scope !3, !noalias !0
-; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP26:%.*]] = shl i64 [[TMP25]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]]
-; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_MOD_VF]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08_IN:%.*]] = phi i64 [ [[I_08:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[I_08]] = add nsw i64 [[I_08_IN]], -1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[COND]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP28:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une double [[TMP28]], 0.000000e+00
-; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP29:%.*]] = load double, double* [[ARRAYIDX1]], align 8
-; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP29]], 1.000000e+00
-; CHECK-NEXT: store double [[ADD]], double* [[ARRAYIDX1]], align 8
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_08_IN]], 1
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]]
-;
+; CHECK-LABEL: vector.body:
+; CHECK: %[[REVERSE6:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
+; CHECK: %[[WIDEMSKLOAD:.*]] = call <vscale x 4 x double> @llvm.masked.load.nxv4f64.p0nxv4f64(<vscale x 4 x double>* %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE6]], <vscale x 4 x double> poison)
+; CHECK-NEXT: %[[FADD:.*]] = fadd <vscale x 4 x double> %[[WIDEMSKLOAD]]
+; CHECK: %[[REVERSE9:.*]] = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %{{.*}})
+; CHECK: call void @llvm.masked.store.nxv4f64.p0nxv4f64(<vscale x 4 x double> %[[FADD]], <vscale x 4 x double>* %{{.*}}, i32 8, <vscale x 4 x i1> %[[REVERSE9]]
entry:
%cmp7 = icmp sgt i64 %N, 0
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
define void @widen_extractvalue(i64* %dst, {i64, i64} %sv) #0 {
; CHECK-LABEL: @widen_extractvalue(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 0, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 2
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 0, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 0, [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i64, i64 } [[SV:%.*]], 0
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP5]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { i64, i64 } [[SV]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP6]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[DST:%.*]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 2 x i64> [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, i64* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[TMP9]] to <vscale x 2 x i64>*
-; CHECK-NEXT: store <vscale x 2 x i64> [[TMP8]], <vscale x 2 x i64>* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[TMP11]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP12]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 0, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_BODY:%.*]]
-; CHECK: loop.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_BODY]] ]
-; CHECK-NEXT: [[A:%.*]] = extractvalue { i64, i64 } [[SV]], 0
-; CHECK-NEXT: [[B:%.*]] = extractvalue { i64, i64 } [[SV]], 1
-; CHECK-NEXT: [[ADDR:%.*]] = getelementptr i64, i64* [[DST]], i32 [[IV]]
-; CHECK-NEXT: [[ADD:%.*]] = add i64 [[A]], [[B]]
-; CHECK-NEXT: store i64 [[ADD]], i64* [[ADDR]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp ne i32 [[IV_NEXT]], 0
-; CHECK-NEXT: br i1 [[COND]], label [[LOOP_BODY]], label [[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body:
+; CHECK: [[EXTRACT0:%.*]] = extractvalue { i64, i64 } [[SV:%.*]], 0
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[EXTRACT0]], i32 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[EXTRACT1:%.*]] = extractvalue { i64, i64 } [[SV]], 1
+; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[EXTRACT1]], i32 0
+; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK: [[ADD:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[DOTSPLAT2]]
entry:
br label %loop.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -scalable-vectorization=off -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S < %s | FileCheck %s
; NOTE: These tests aren't really target-specific, but it's convenient to target AArch64.
; Check that we don't artificially create new predicated blocks for the uniform load.
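; Roughly, @uniform_load below corresponds to the following C loop (a sketch
; inferred from the scalar checks; names are illustrative only):
;
;   for (long i = 0; i < n; i++)
;     dst[i] = *src;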
define void @uniform_load(i32* noalias %dst, i32* noalias readonly %src, i64 %n) #0 {
; CHECK-LABEL: @uniform_load(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N:%.*]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 [[N]])
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[SRC:%.*]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[BROADCAST_SPLAT]], <4 x i32>* [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[VAL]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IDX]], 0
+; CHECK-NEXT: [[LOOP_PRED:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 %n)
+; CHECK-NEXT: [[LOAD_VAL:%.*]] = load i32, i32* %src, align 4
+; CHECK-NOT: load i32, i32* %src, align 4
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> poison, i32 [[LOAD_VAL]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* %dst, i64 [[TMP3]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT: [[STORE_PTR:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32>* [[STORE_PTR]], i32 4, <4 x i1> [[LOOP_PRED]])
+; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IDX_NEXT]], %n.vec
+; CHECK-NEXT: br i1 [[CMP]], label %middle.block, label %vector.body
entry:
br label %for.body
; Check that the gather mask is the loop predicate ANDed with the original condition.
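; Roughly, @cond_uniform_load below corresponds to the following C loop (a
; sketch inferred from the scalar checks; names are illustrative only):
;
;   for (long i = 0; i < n; i++)
;     dst[i] = (cond[i] != 0) ? *src : 0;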
define void @cond_uniform_load(i32* nocapture %dst, i32* nocapture readonly %src, i32* nocapture readonly %cond, i64 %n) #0 {
; CHECK-LABEL: @cond_uniform_load(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[DST1:%.*]] = bitcast i32* [[DST:%.*]] to i8*
-; CHECK-NEXT: [[COND3:%.*]] = bitcast i32* [[COND:%.*]] to i8*
-; CHECK-NEXT: [[SRC6:%.*]] = bitcast i32* [[SRC:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[DST]], i64 [[N:%.*]]
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[COND]], i64 [[N]]
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[SRC]], i64 1
-; CHECK-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[DST1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[COND3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[DST1]], [[SCEVGEP78]]
-; CHECK-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[SRC6]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
-; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32*> poison, i32* [[SRC]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32*> [[BROADCAST_SPLATINSERT]], <4 x i32*> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* %src, i32 0
+; CHECK-NEXT: [[SRC_SPLAT:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX12]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 [[N]])
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison), !alias.scope !4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[WIDE_MASKED_LOAD]], zeroinitializer
+; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[IDX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[IDX]], 0
+; CHECK-NEXT: [[LOOP_PRED:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP3]], i64 %n)
+; CHECK: [[COND_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{%.*}}, i32 4, <4 x i1> [[LOOP_PRED]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[COND_LOAD]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[BROADCAST_SPLAT]], i32 4, <4 x i1> [[TMP6]], <4 x i32> undef), !alias.scope !7
-; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> zeroinitializer, <4 x i32> [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = or <4 x i1> [[TMP6]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[PREDPHI]], <4 x i32>* [[TMP11]], i32 4, <4 x i1> [[TMP9]]), !alias.scope !9, !noalias !11
-; CHECK-NEXT: [[INDEX_NEXT13]] = add i64 [[INDEX12]], 4
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP13]], 0
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[TMP14]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDEX]]
-; CHECK-NEXT: store i32 [[VAL_0]], i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-NEXT: [[MASK:%.*]] = select <4 x i1> [[LOOP_PRED]], <4 x i1> [[TMP5]], <4 x i1> zeroinitializer
+; CHECK-NEXT: call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[SRC_SPLAT]], i32 4, <4 x i1> [[MASK]], <4 x i32> undef)
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -vector-library=Darwin_libsystem_m -inject-tli-mappings -loop-vectorize -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
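; With -vector-library=Darwin_libsystem_m and -inject-tli-mappings, the scalar
; libm calls below (expf, exp, acosf, acos, asinf, asin, atanf, atan, atan2f,
; atan2, cosf, cos) are expected to be widened to the corresponding Darwin
; vector routines (_simd_exp_f4, _simd_exp_d2, and so on), which is what the
; CHECK lines verify.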
declare float @expf(float) nounwind readnone
define void @expf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @expf_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_exp_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_exp_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @expf(float [[LV]]) #[[ATTR1:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_exp_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @exp(double) nounwind readnone
define void @exp_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
; CHECK-LABEL: @exp_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_exp_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_exp_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @exp(double [[LV]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_exp_d2(
+; CHECK: ret void
entry:
br label %for.body
declare float @acosf(float) nounwind readnone
define void @acos_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @acos_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_acos_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_acos_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @acosf(float [[LV]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_acos_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @acos(double) nounwind readnone
define void @acos_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
; CHECK-LABEL: @acos_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_acos_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_acos_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @acos(double [[LV]]) #[[ATTR4:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_acos_d2(
+; CHECK: ret void
entry:
br label %for.body
declare float @asinf(float) nounwind readnone
define void @asinf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @asinf_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_asin_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_asin_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @asinf(float [[LV]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_asin_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @asin(double) nounwind readnone
define void @asin_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
; CHECK-LABEL: @asin_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_asin_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_asin_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @asin(double [[LV]]) #[[ATTR6:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_asin_d2(
+; CHECK: ret void
entry:
br label %for.body
ret void
}
- declare float @atanf(float) nounwind readnone
+ declare float @atanf(float) nounwind readnone
define void @atanf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @atanf_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_atan_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_atan_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @atanf(float [[LV]]) #[[ATTR7:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_atan_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @atan(double) nounwind readnone
define void @atan_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
; CHECK-LABEL: @atan_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_atan_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_atan_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @atan(double [[LV]]) #[[ATTR8:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_atan_d2(
+; CHECK: ret void
entry:
br label %for.body
declare float @atan2f(float) nounwind readnone
define void @atan2f_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @atan2f_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_atan2_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_atan2_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @atan2f(float [[LV]]) #[[ATTR9:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_atan2_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @atan2(double) nounwind readnone
define void @atan2_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
; CHECK-LABEL: @atan2_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_atan2_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_atan2_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @atan2(double [[LV]]) #[[ATTR10:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_atan2_d2(
+; CHECK: ret void
entry:
br label %for.body
declare float @cosf(float) nounwind readnone
define void @cosf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @cosf_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_cos_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_cos_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @cosf(float [[LV]]) #[[ATTR11:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_cos_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @cos(double) nounwind readnone
define void @cos_v2f64(i64 %n, double* noalias %y, double* noalias %x) {
; CHECK-LABEL: @cos_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_cos_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_cos_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @cos(double [[LV]]) #[[ATTR12:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_cos_d2(
+; CHECK: ret void
entry:
br label %for.body
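+; Check that @cbrtf and @cbrt are vectorized to @_simd_cbrt_f4 and @_simd_cbrt_d2.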
declare float @cbrtf(float) nounwind readnone
define void @cbrtf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @cbrtf_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_cbrt_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_cbrt_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @cbrtf(float [[LV]]) #[[ATTR13:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_cbrt_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @cbrt(double) nounwind readnone
define void @cbrt_v2f64(i64 %n, double* noalias %y, double* noalias %x) {
; CHECK-LABEL: @cbrt_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_cbrt_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_cbrt_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @cbrt(double [[LV]]) #[[ATTR14:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_cbrt_d2(
+; CHECK: ret void
entry:
br label %for.body
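+; Check that @erff and @erf are vectorized to @_simd_erf_f4 and @_simd_erf_d2.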
declare float @erff(float) nounwind readnone
define void @erff_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @erff_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_erf_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_erf_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @erff(float [[LV]]) #[[ATTR15:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_erf_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @erf(double) nounwind readnone
define void @erf_v2f64(i64 %n, double* noalias %y, double* noalias %x) {
; CHECK-LABEL: @erf_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_erf_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_erf_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @erf(double [[LV]]) #[[ATTR16:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_erf_d2(
+; CHECK: ret void
entry:
br label %for.body
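+; Check that @powf and @pow are vectorized to @_simd_pow_f4 and @_simd_pow_d2.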
declare float @powf(float) nounwind readnone
define void @powf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @powf_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_pow_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_pow_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @powf(float [[LV]]) #[[ATTR17:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP35:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_pow_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @pow(double) nounwind readnone
define void @pow_v2f64(i64 %n, double* noalias %y, double* noalias %x) {
; CHECK-LABEL: @pow_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_pow_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_pow_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @pow(double [[LV]]) #[[ATTR18:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_pow_d2(
+; CHECK: ret void
entry:
br label %for.body
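+; Check that @sinhf and @sinh are vectorized to @_simd_sinh_f4 and @_simd_sinh_d2.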
declare float @sinhf(float) nounwind readnone
define void @sinhf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @sinhf_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_sinh_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_sinh_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @sinhf(float [[LV]]) #[[ATTR19:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_sinh_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @sinh(double) nounwind readnone
define void @sinh_v2f64(i64 %n, double* noalias %y, double* noalias %x) {
; CHECK-LABEL: @sinh_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_sinh_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_sinh_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @sinh(double [[LV]]) #[[ATTR20:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_sinh_d2(
+; CHECK: ret void
entry:
br label %for.body
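+; Check that @coshf and @cosh are vectorized to @_simd_cosh_f4 and @_simd_cosh_d2.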
declare float @coshf(float) nounwind readnone
define void @coshf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @coshf_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_cosh_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_cosh_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @coshf(float [[LV]]) #[[ATTR21:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_cosh_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @cosh(double) nounwind readnone
define void @cosh_v2f64(i64 %n, double* noalias %y, double* noalias %x) {
; CHECK-LABEL: @cosh_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_cosh_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_cosh_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @cosh(double [[LV]]) #[[ATTR22:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_cosh_d2(
+; CHECK: ret void
entry:
br label %for.body
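+; Check that @tanhf and @tanh are vectorized to @_simd_tanh_f4 and @_simd_tanh_d2.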
declare float @tanhf(float) nounwind readnone
define void @tanhf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @tanhf_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_tanh_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_tanh_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @tanhf(float [[LV]]) #[[ATTR23:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_tanh_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @tanh(double) nounwind readnone
define void @tanh_v2f64(i64 %n, double* noalias %y, double* noalias %x) {
; CHECK-LABEL: @tanh_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_tanh_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_tanh_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @tanh(double [[LV]]) #[[ATTR24:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_tanh_d2(
+; CHECK: ret void
entry:
br label %for.body
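+; Check that @asinhf and @asinh are vectorized to @_simd_asinh_f4 and @_simd_asinh_d2.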
declare float @asinhf(float) nounwind readnone
define void @asinhf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @asinhf_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_asinh_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_asinh_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @asinhf(float [[LV]]) #[[ATTR25:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP51:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_asinh_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @asinh(double) nounwind readnone
define void @asinh_v2f64(i64 %n, double* noalias %y, double* noalias %x) {
; CHECK-LABEL: @asinh_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_asinh_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_asinh_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @asinh(double [[LV]]) #[[ATTR26:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP53:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_asinh_d2(
+; CHECK: ret void
entry:
br label %for.body
declare float @acoshf(float) nounwind readnone
define void @acoshf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @acoshf_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_acosh_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_acosh_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @acoshf(float [[LV]]) #[[ATTR27:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP55:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_acosh_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @acosh(double) nounwind readnone
define void @acosh_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
; CHECK-LABEL: @acosh_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_acosh_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_acosh_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP56:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @acosh(double [[LV]]) #[[ATTR28:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP57:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_acosh_d2(
+; CHECK: ret void
entry:
br label %for.body
declare float @atanhf(float) nounwind readnone
define void @atanhf_v4f32(i64 %n, float* noalias %y, float* noalias %x) {
; CHECK-LABEL: @atanhf_v4f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @_simd_atanh_f4(<4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <4 x float> @_simd_atanh_f4(<4 x float> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds float, float* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load float, float* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @atanhf(float [[LV]]) #[[ATTR29:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds float, float* [[X]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP59:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <4 x float> @_simd_atanh_f4(
+; CHECK: ret void
entry:
br label %for.body
declare double @atanh(double) nounwind readnone
define void @atanh_v2f64(i64 %n, double* noalias %y, double * noalias %x) {
; CHECK-LABEL: @atanh_v2f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[Y:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = call <2 x double> @_simd_atanh_d2(<2 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @_simd_atanh_d2(<2 x double> [[WIDE_LOAD1]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[X:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP10]], i32 2
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP60:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[GEP_Y:%.*]] = getelementptr inbounds double, double* [[Y]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load double, double* [[GEP_Y]], align 4
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @atanh(double [[LV]]) #[[ATTR30:[0-9]+]]
-; CHECK-NEXT: [[GEP_X:%.*]] = getelementptr inbounds double, double* [[X]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[GEP_X]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP61:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: call <2 x double> @_simd_atanh_d2(
+; CHECK: ret void
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; Test VLA for reverse with fixed size vector
; This is the C++ loop being vectorized in this file with a
; shuffle reverse:
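; A sketch of that loop, reconstructed from the scalar IR below (the loop
; counts down and adds 1.0 to each element; 'N', 'a' and 'b' are the
; function arguments):
;   for (long i = N - 1; i >= 0; --i)
;     a[i] = b[i] + 1.0;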
; RUN: opt -loop-vectorize -dce -mtriple aarch64-linux-gnu -S < %s | FileCheck %s
define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0 {
-; CHECK-LABEL: @vector_reverse_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
-; CHECK-NEXT: [[B3:%.*]] = bitcast double* [[B:%.*]] to i8*
-; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 [[N]]
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[B]], i64 [[N]]
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX]]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 -7
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <8 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x double>, <8 x double>* [[TMP5]], align 8, !alias.scope !0
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <8 x double> [[WIDE_LOAD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP6:%.*]] = fadd <8 x double> [[REVERSE]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[REVERSE6:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, double* [[TMP8]], i32 -7
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[TMP9]] to <8 x double>*
-; CHECK-NEXT: store <8 x double> [[REVERSE6]], <8 x double>* [[TMP10]], align 8, !alias.scope !3, !noalias !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08_IN:%.*]] = phi i64 [ [[I_08:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[I_08]] = add nsw i64 [[I_08_IN]], -1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP12]], 1.000000e+00
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]]
-; CHECK-NEXT: store double [[ADD]], double* [[ARRAYIDX1]], align 8
-; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_08_IN]], 1
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP8:![0-9]+]]
-;
+; CHECK-LABEL: vector_reverse_f64
+; CHECK-LABEL: vector.body
+; CHECK: %[[GEP:.*]] = getelementptr inbounds double, double* %{{.*}}, i32 0
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds double, double* %[[GEP]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP1]] to <8 x double>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <8 x double>, <8 x double>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x double> %[[WIDE]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[FADD:.*]] = fadd <8 x double> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds double, double* {{.*}}, i64 {{.*}}
+; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x double> %[[FADD]], <8 x double> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds double, double* %[[GEP2]], i32 0
+; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds double, double* %[[GEP3]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast double* %[[GEP4]] to <8 x double>*
+; CHECK-NEXT: store <8 x double> %[[REVERSE6]], <8 x double>* %[[CAST]], align 8
entry:
%cmp7 = icmp sgt i64 %N, 0
}
define void @vector_reverse_i64(i64 %N, i64* %a, i64* %b) #0 {
-; CHECK-LABEL: @vector_reverse_i64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A1:%.*]] = bitcast i64* [[A:%.*]] to i8*
-; CHECK-NEXT: [[B3:%.*]] = bitcast i64* [[B:%.*]] to i8*
-; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i64, i64* [[A]], i64 [[N]]
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i64* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i64, i64* [[B]], i64 [[N]]
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i64* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[N]], [[INDEX]]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], -1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[TMP3]], i32 -7
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <8 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i64>, <8 x i64>* [[TMP5]], align 8, !alias.scope !9
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i64> [[WIDE_LOAD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i64> [[REVERSE]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[REVERSE6:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[TMP8]], i32 -7
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[TMP9]] to <8 x i64>*
-; CHECK-NEXT: store <8 x i64> [[REVERSE6]], <8 x i64>* [[TMP10]], align 8, !alias.scope !12, !noalias !9
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[FOR_BODY_PREHEADER]] ], [ [[N]], [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[I_09_IN:%.*]] = phi i64 [ [[I_09:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[I_09]] = add nsw i64 [[I_09_IN]], -1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[I_09]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[ADD:%.*]] = add i64 [[TMP12]], 1
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_09]]
-; CHECK-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX2]], align 8
-; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[I_09_IN]], 1
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]]
-;
+; CHECK-LABEL: vector_reverse_i64
+; CHECK-LABEL: vector.body
+; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, i64* %{{.*}}, i32 0
+; CHECK-NEXT: %[[GEP1:.*]] = getelementptr inbounds i64, i64* %[[GEP]], i32 -7
+; CHECK-NEXT: %[[CAST:.*]] = bitcast i64* %[[GEP1]] to <8 x i64>*
+; CHECK-NEXT: %[[WIDE:.*]] = load <8 x i64>, <8 x i64>* %[[CAST]], align 8
+; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x i64> %[[WIDE]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[ADD:.*]] = add <8 x i64> %[[REVERSE]]
+; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i64, i64* {{.*}}, i64 {{.*}}
+; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x i64> %[[ADD]], <8 x i64> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds i64, i64* %[[GEP2]], i32 0
+; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds i64, i64* %[[GEP3]], i32 -7
+; CHECK-NEXT: %[[CAST1:.*]] = bitcast i64* %[[GEP4]] to <8 x i64>*
+; CHECK-NEXT: store <8 x i64> %[[REVERSE6]], <8 x i64>* %[[CAST1]], align 8
entry:
%cmp8 = icmp sgt i64 %N, 0
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -mtriple=thumbv7s-apple-ios6.0.0 -S -enable-interleaved-mem-accesses=false < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
; CHECK-NOT: <4 x float>
define void @_Z4testmm(i32 %size, i32 %offset) {
-; CHECK-LABEL: @_Z4testmm(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP53:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP53]], label [[FOR_END:%.*]], label [[FOR_BODY_LR_PH:%.*]]
-; CHECK: for.body.lr.ph:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[R_057:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD10:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[G_056:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD20:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[V_055:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[B_054:%.*]] = phi float [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD30:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[V_055]], [[OFFSET:%.*]]
-; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[ADD]], 3
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[MUL]]
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel, i32 0, i32 [[V_055]]
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[MUL3:%.*]] = fmul fast float [[TMP0]], [[TMP1]]
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel2, i32 0, i32 [[V_055]]
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[MUL5:%.*]] = fmul fast float [[MUL3]], [[TMP2]]
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel3, i32 0, i32 [[V_055]]
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[MUL7:%.*]] = fmul fast float [[MUL5]], [[TMP3]]
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [512 x float], [512 x float]* @kernel4, i32 0, i32 [[V_055]]
-; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX8]], align 4
-; CHECK-NEXT: [[MUL9:%.*]] = fmul fast float [[MUL7]], [[TMP4]]
-; CHECK-NEXT: [[ADD10]] = fadd fast float [[R_057]], [[MUL9]]
-; CHECK-NEXT: [[ARRAYIDX_SUM:%.*]] = add i32 [[MUL]], 1
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[ARRAYIDX_SUM]]
-; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX11]], align 4
-; CHECK-NEXT: [[MUL13:%.*]] = fmul fast float [[TMP1]], [[TMP5]]
-; CHECK-NEXT: [[MUL15:%.*]] = fmul fast float [[TMP2]], [[MUL13]]
-; CHECK-NEXT: [[MUL17:%.*]] = fmul fast float [[TMP3]], [[MUL15]]
-; CHECK-NEXT: [[MUL19:%.*]] = fmul fast float [[TMP4]], [[MUL17]]
-; CHECK-NEXT: [[ADD20]] = fadd fast float [[G_056]], [[MUL19]]
-; CHECK-NEXT: [[ARRAYIDX_SUM52:%.*]] = add i32 [[MUL]], 2
-; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [1536 x float], [1536 x float]* @src_data, i32 0, i32 [[ARRAYIDX_SUM52]]
-; CHECK-NEXT: [[TMP6:%.*]] = load float, float* [[ARRAYIDX21]], align 4
-; CHECK-NEXT: [[MUL23:%.*]] = fmul fast float [[TMP1]], [[TMP6]]
-; CHECK-NEXT: [[MUL25:%.*]] = fmul fast float [[TMP2]], [[MUL23]]
-; CHECK-NEXT: [[MUL27:%.*]] = fmul fast float [[TMP3]], [[MUL25]]
-; CHECK-NEXT: [[MUL29:%.*]] = fmul fast float [[TMP4]], [[MUL27]]
-; CHECK-NEXT: [[ADD30]] = fadd fast float [[B_054]], [[MUL29]]
-; CHECK-NEXT: [[INC]] = add i32 [[V_055]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], [[SIZE]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]]
-; CHECK: for.cond.for.end_crit_edge:
-; CHECK-NEXT: [[ADD30_LCSSA:%.*]] = phi float [ [[ADD30]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ADD20_LCSSA:%.*]] = phi float [ [[ADD20]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ADD10_LCSSA:%.*]] = phi float [ [[ADD10]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[PHITMP:%.*]] = fptoui float [[ADD10_LCSSA]] to i8
-; CHECK-NEXT: [[PHITMP60:%.*]] = fptoui float [[ADD20_LCSSA]] to i8
-; CHECK-NEXT: [[PHITMP61:%.*]] = fptoui float [[ADD30_LCSSA]] to i8
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: [[R_0_LCSSA:%.*]] = phi i8 [ [[PHITMP]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[G_0_LCSSA:%.*]] = phi i8 [ [[PHITMP60]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[B_0_LCSSA:%.*]] = phi i8 [ [[PHITMP61]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: store i8 [[R_0_LCSSA]], i8* @r_, align 4
-; CHECK-NEXT: store i8 [[G_0_LCSSA]], i8* @g_, align 4
-; CHECK-NEXT: store i8 [[B_0_LCSSA]], i8* @b_, align 4
-; CHECK-NEXT: ret void
-;
entry:
%cmp53 = icmp eq i32 %size, 0
br i1 %cmp53, label %for.end, label %for.body.lr.ph
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP2]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP5]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 3
; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw <4 x i32> [[TMP1]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP2]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP5]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 4
; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i32 [[TMP0]], [[STRIDE]]
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP5]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32>* [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP1]], i32 [[N]])
+; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i32 [[TMP1]], [[STRIDE]]
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 3, i32 11, i32 19, i32 27>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND2]]
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw <4 x i32> [[TMP2]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP3]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP4]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
-; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND2]]
+; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw <4 x i32> [[TMP5]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP6]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 32, i32 32, i32 32, i32 32>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP13]]
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND4]]
-; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw <4 x i32> [[TMP4]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP5]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP6]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
-; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = mul nuw nsw <4 x i32> [[VEC_IND]], [[VEC_IND4]]
+; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw <4 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP8]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP9]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_GATHER]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
+; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[VEC_IND_NEXT5]] = add <4 x i32> [[VEC_IND4]], [[DOTSPLAT3]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP15]]
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[DST]], i32 [[I_023]]
; CHECK-NEXT: store i32 [[ADD7]], i32* [[ARRAYIDX9]], align 4
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
br i1 %cmp, label %for.body, label %for.cond.cleanup
}
-attributes #0 = { "target-features"="+mve" }
+attributes #0 = { "target-features"="+mve" }
\ No newline at end of file
; CHECK-NEXT: store <16 x i8> [[TMP5]], <16 x i8>* [[TMP6]], align 1, !alias.scope !3, !noalias !0
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]]
; CHECK-NEXT: store i8 [[COND11]], i8* [[PDST_ADDR_020]], align 1
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_021]], -1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP7:!llvm.loop !.*]]
; CHECK: while.end:
; CHECK-NEXT: ret void
;
; CHECK-NEXT: store <8 x i16> [[TMP5]], <8 x i16>* [[TMP6]], align 2, !alias.scope !11, !noalias !8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP13:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]]
; CHECK-NEXT: store i16 [[COND11]], i16* [[PDST_ADDR_021]], align 2
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_022]], -1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP14:!llvm.loop !.*]]
; CHECK: while.end:
; CHECK-NEXT: ret void
;
; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, !alias.scope !18, !noalias !15
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]]
; CHECK-NEXT: store i32 [[COND6]], i32* [[PDST_ADDR_015]], align 4
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_016]], -1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], [[LOOP21:!llvm.loop !.*]]
; CHECK: while.end:
; CHECK-NEXT: ret void
;
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_MASKED_LOAD]])
; CHECK-NEXT: [[TMP3]] = add i32 [[TMP2]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[TMP10]] = add i32 [[TMP9]], [[TMP8]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK: vector.ph:
; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[BLOCKSIZE]], 7
; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N_RND_UP]], -8
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT6]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET:%.*]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT8]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[INDEX]], i32 [[BLOCKSIZE]])
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>*
; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP0]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i16> poison)
-; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_MASKED_LOAD]], <8 x i16> [[BROADCAST_SPLAT7]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_MASKED_LOAD]], <8 x i16> [[BROADCAST_SPLAT9]])
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[NEXT_GEP5]] to <8 x i16>*
; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> [[TMP1]], <8 x i16>* [[TMP2]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP16]] = fsub fast <2 x double> [[VEC_PHI]], [[TMP15]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP18:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP16]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[T]], [[N_VEC]]
; CHECK-NEXT: [[SUB127]] = fsub fast double [[DVAL1_4131]], [[MUL126]]
; CHECK-NEXT: [[INC129]] = add nuw nsw i32 [[I_2132]], 1
; CHECK-NEXT: [[EXITCOND143:%.*]] = icmp eq i32 [[INC129]], [[T]]
-; CHECK-NEXT: br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND143]], label [[OUTEREND]], label [[INNERLOOP]], [[LOOP2:!llvm.loop !.*]]
; CHECK: outerend:
; CHECK-NEXT: [[SUB127_LCSSA:%.*]] = phi double [ [[SUB127]], [[INNERLOOP]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[CONV138:%.*]] = fptosi double [[SUB127_LCSSA]] to i32
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -loop-vectorize -tail-predication=enabled -S < %s | \
; RUN: FileCheck %s
; Test that ARMTTIImpl::preferPredicateOverEpilogue triggers tail-folding.
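; A rough C equivalent of the loop under test, reconstructed from the scalar
; loop in the original checks (parameter names are illustrative):
;
;   void f1(int *A, const int *B, const int *C, int N) {
;     for (int i = 0; i < N; i++)
;       A[i] = B[i] + C[i];
;   }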
define dso_local void @f1(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) {
-; CHECK-LABEL: @f1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_09]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_09]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_09]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
+; CHECK-LABEL: f1(
+; CHECK: entry:
+; CHECK: @llvm.get.active.lane.mask
+; CHECK: }
entry:
%cmp8 = icmp sgt i32 %N, 0
br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
}
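; @f32_reduction is a fast-math fadd reduction over a pointer-bumped,
; count-down loop, followed by a divide to form the mean. A rough C
; equivalent, inferred from the original checks (names are illustrative):
;
;   void f32_reduction(const float *Input, unsigned N, float *Output) {
;     float sum = 0.0f;
;     for (unsigned i = N; i != 0; i--)
;       sum += *Input++;
;     *Output = sum / (float)N;
;   }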
define dso_local void @f32_reduction(float* nocapture readonly %Input, i32 %N, float* nocapture %Output) {
-; CHECK-LABEL: @f32_reduction(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP6:%.*]] = icmp eq i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP6]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
-; CHECK: while.body.preheader:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[N]], [[N_VEC]]
-; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr float, float* [[INPUT:%.*]], i32 [[N_VEC]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: f32_reduction(
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[INPUT]], i32 [[TMP0]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IV]], i32 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP1]], i32 [[N]])
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP4]], <4 x float> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP5]])
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[INPUT]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[WHILE_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_08:%.*]] = phi float [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INPUT_ADDR_07:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[INPUT_ADDR_07]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[INPUT_ADDR_07]], align 4
-; CHECK-NEXT: [[ADD]] = fadd fast float [[TMP8]], [[SUM_08]]
-; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_END_LOOPEXIT]], label [[WHILE_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_END]]
-; CHECK: while.end:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT: [[CONV:%.*]] = uitofp i32 [[N]] to float
-; CHECK-NEXT: [[DIV:%.*]] = fdiv fast float [[SUM_0_LCSSA]], [[CONV]]
-; CHECK-NEXT: store float [[DIV]], float* [[OUTPUT:%.*]], align 4
-; CHECK-NEXT: ret void
-;
+; CHECK: @llvm.masked.load
+; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp6 = icmp eq i32 %N, 0
br i1 %cmp6, label %while.end, label %while.body.preheader
}
define dso_local void @f16_reduction(half* nocapture readonly %Input, i32 %N, half* nocapture %Output) {
-; CHECK-LABEL: @f16_reduction(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP6:%.*]] = icmp eq i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP6]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
-; CHECK: while.body.preheader:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[N]], [[N_VEC]]
-; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr half, half* [[INPUT:%.*]], i32 [[N_VEC]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: f16_reduction(
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr half, half* [[INPUT]], i32 [[TMP0]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i32> [[VEC_IV]], i32 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[TMP1]], i32 [[N]])
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr half, half* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast half* [[TMP2]] to <8 x half>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* [[TMP3]], i32 2, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x half> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <8 x half> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP5]] = select <8 x i1> [[ACTIVE_LANE_MASK]], <8 x half> [[TMP4]], <8 x half> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH8000, <8 x half> [[TMP5]])
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi half* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[INPUT]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi half [ 0xH0000, [[WHILE_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_08:%.*]] = phi half [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INPUT_ADDR_07:%.*]] = phi half* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds half, half* [[INPUT_ADDR_07]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = load half, half* [[INPUT_ADDR_07]], align 2
-; CHECK-NEXT: [[ADD]] = fadd fast half [[TMP8]], [[SUM_08]]
-; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_END_LOOPEXIT]], label [[WHILE_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi half [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_END]]
-; CHECK: while.end:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi half [ 0xH0000, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT: [[CONV:%.*]] = uitofp i32 [[N]] to half
-; CHECK-NEXT: [[DIV:%.*]] = fdiv fast half [[SUM_0_LCSSA]], [[CONV]]
-; CHECK-NEXT: store half [[DIV]], half* [[OUTPUT:%.*]], align 2
-; CHECK-NEXT: ret void
-;
+; CHECK: @llvm.masked.load
+; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp6 = icmp eq i32 %N, 0
br i1 %cmp6, label %while.end, label %while.body.preheader
}
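; @mixed_f32_i32_reduction carries two reductions in one loop: the i32 add
; reduction is computed in-loop via @llvm.vector.reduce.add, while the float
; fadd reduction stays vectorized until the middle block. A rough C
; equivalent (names are illustrative):
;
;   void mixed_f32_i32_reduction(const float *fIn, const int *iIn, unsigned N,
;                                float *fOut, int *iOut) {
;     float fsum = 0.0f;
;     int isum = 0;
;     for (unsigned i = N; i != 0; i--) {
;       isum += *iIn++;
;       fsum += *fIn++;
;     }
;     *fOut = fsum / (float)N;
;     *iOut = (int)((float)isum / (float)N);
;   }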
define dso_local void @mixed_f32_i32_reduction(float* nocapture readonly %fInput, i32* nocapture readonly %iInput, i32 %N, float* nocapture %fOutput, i32* nocapture %iOutput) {
-; CHECK-LABEL: @mixed_f32_i32_reduction(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP15]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
-; CHECK: while.body.preheader:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[N]], [[N_VEC]]
-; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr float, float* [[FINPUT:%.*]], i32 [[N_VEC]]
-; CHECK-NEXT: [[IND_END4:%.*]] = getelementptr i32, i32* [[IINPUT:%.*]], i32 [[N_VEC]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: mixed_f32_i32_reduction(
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr float, float* [[FINPUT]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i32, i32* [[IINPUT]], i32 [[TMP1]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[VEC_IV]], i32 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP2]], i32 [[N]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[NEXT_GEP6]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[WIDE_MASKED_LOAD]], <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT: [[TMP7]] = add i32 [[TMP6]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> poison)
-; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <4 x float> [[WIDE_MASKED_LOAD7]], [[VEC_PHI5]]
-; CHECK-NEXT: [[TMP11]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP10]], <4 x float> [[VEC_PHI5]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP11]])
-; CHECK-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[N]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi float* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[FINPUT]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL3:%.*]] = phi i32* [ [[IND_END4]], [[MIDDLE_BLOCK]] ], [ [[IINPUT]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[WHILE_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX8:%.*]] = phi float [ 0.000000e+00, [[WHILE_BODY_PREHEADER]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[BLKCNT_020:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ISUM_019:%.*]] = phi i32 [ [[ADD2:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[FSUM_018:%.*]] = phi float [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX8]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[FINPUT_ADDR_017:%.*]] = phi float* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[IINPUT_ADDR_016:%.*]] = phi i32* [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL3]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, float* [[FINPUT_ADDR_017]], i32 1
-; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i32, i32* [[IINPUT_ADDR_016]], i32 1
-; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[IINPUT_ADDR_016]], align 4
-; CHECK-NEXT: [[ADD2]] = add nsw i32 [[TMP14]], [[ISUM_019]]
-; CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[FINPUT_ADDR_017]], align 4
-; CHECK-NEXT: [[ADD]] = fadd fast float [[TMP15]], [[FSUM_018]]
-; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_020]], -1
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DEC]], 0
-; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_END_LOOPEXIT]], label [[WHILE_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD2]], [[WHILE_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[PHITMP:%.*]] = sitofp i32 [[ADD2_LCSSA]] to float
-; CHECK-NEXT: br label [[WHILE_END]]
-; CHECK: while.end:
-; CHECK-NEXT: [[FSUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT: [[ISUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[PHITMP]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT: [[CONV:%.*]] = uitofp i32 [[N]] to float
-; CHECK-NEXT: [[DIV:%.*]] = fdiv fast float [[FSUM_0_LCSSA]], [[CONV]]
-; CHECK-NEXT: store float [[DIV]], float* [[FOUTPUT:%.*]], align 4
-; CHECK-NEXT: [[DIV5:%.*]] = fdiv fast float [[ISUM_0_LCSSA]], [[CONV]]
-; CHECK-NEXT: [[CONV6:%.*]] = fptosi float [[DIV5]] to i32
-; CHECK-NEXT: store i32 [[CONV6]], i32* [[IOUTPUT:%.*]], align 4
-; CHECK-NEXT: ret void
-;
+; CHECK: @llvm.masked.load
+; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp15 = icmp eq i32 %N, 0
br i1 %cmp15, label %while.end, label %while.body.preheader
}
define dso_local i32 @i32_mul_reduction(i32* noalias nocapture readonly %B, i32 %N) {
-; CHECK-LABEL: @i32_mul_reduction(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: i32_mul_reduction(
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL:%.*]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[MUL_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT: ret i32 [[S_0_LCSSA]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[S_07:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_08]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL]] = mul nsw i32 [[TMP8]], [[S_07]]
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-;
+; CHECK: @llvm.masked.load
+; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp6 = icmp sgt i32 %N, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
}
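; Note how the reduction start value is merged into the vector phi: the
; preceding mul reduction starts from <i32 1, i32 1, i32 1, i32 1> (mul
; identity 1 in every lane, coinciding with the start value), while the or
; reduction below starts from <i32 1, i32 0, i32 0, i32 0> (start value 1 in
; lane 0, or identity 0 elsewhere), as the original autogenerated checks
; recorded.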
define dso_local i32 @i32_or_reduction(i32* noalias nocapture readonly %B, i32 %N) {
-; CHECK-LABEL: @i32_or_reduction(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: i32_or_reduction(
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 1, i32 0, i32 0, i32 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP4]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP5]])
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 1, [[FOR_BODY_PREHEADER]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR:%.*]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY:%.*]] ], [ [[OR_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT: ret i32 [[S_0_LCSSA]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[S_07:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_08]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[OR]] = or i32 [[TMP8]], [[S_07]]
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-;
+; CHECK: @llvm.masked.load
+; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp6 = icmp sgt i32 %N, 0
br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
}
define dso_local i32 @i32_and_reduction(i32* noalias nocapture readonly %A, i32 %N, i32 %S) {
-; CHECK-LABEL: @i32_and_reduction(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP5:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP5]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, i32 [[S:%.*]], i32 0
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: i32_and_reduction(
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP1]], i32 [[N]])
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i32> [[WIDE_MASKED_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP6]])
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[S]], [[FOR_BODY_PREHEADER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND:%.*]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[S_ADDR_0_LCSSA:%.*]] = phi i32 [ [[S]], [[ENTRY:%.*]] ], [ [[AND_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT: ret i32 [[S_ADDR_0_LCSSA]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[S_ADDR_06:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_07]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[AND]] = and i32 [[TMP9]], [[S_ADDR_06]]
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-;
+; CHECK: @llvm.masked.load
+; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
%cmp5 = icmp sgt i32 %N, 0
  br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup
}
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -loop-vectorize -tail-predication=enabled -S < %s | \
; RUN: FileCheck %s
; preferPredicateOverEpilogue: hardware-loop is not profitable.
;
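; @tail_folding below checks the negative case: without a hint or flag the
; loop is vectorized but not predicated, so no masked loads or stores appear.
; When tail-folding does trigger (see @predicate_loop_hint further down), each
; access is predicated on @llvm.get.active.lane.mask(%base, %n), which in
; effect computes, for lane j of a VF=4 vector:
;
;   %active.lane.mask[j] = ((%base + j) < %n)   ; unsigned compare, j = 0..3
;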
define dso_local void @tail_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) {
-; CHECK-LABEL: @tail_folding(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: tail_folding(
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 428
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 430, 428
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 428, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
+; CHECK-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; CHECK-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
+; CHECK: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body
}
; tail-folded.
;
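; The trip count of this loop is 430 and the VF is 4, so the folded vector
; loop rounds the count up to 432 = 108 * 4 and exits when %index.next
; reaches 432; with the tail folded into the masked operations there is no
; scalar remainder, which is why the original checks showed the middle block
; branching unconditionally (br i1 true).
;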
define dso_local void @predicate_loop_hint(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) {
-; CHECK-LABEL: @predicate_loop_hint(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: predicate_loop_hint(
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 430)
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 432, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %[[ELEM0:.*]] = add i64 %index, 0
+; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 430)
+; CHECK: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
+; CHECK: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
+; CHECK: %[[ADD:.*]] = add nsw <4 x i32> %[[WML2]], %[[WML1]]
+; CHECK: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]], {{.*}}<4 x i1> %active.lane.mask
+; CHECK: %index.next = add i64 %index, 4
+; CHECK: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
  br label %for.body
}
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -loop-vectorize -tail-predication=enabled -S < %s | \
; RUN: FileCheck %s -check-prefix=CHECK
; overrules this.
;
define dso_local void @flag_overrules_hint(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
-; CHECK-LABEL: @flag_overrules_hint(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-LABEL: flag_overrules_hint(
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 428
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 430, 428
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 428, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
-; PREDFLAG-LABEL: @flag_overrules_hint(
-; PREDFLAG-NEXT: entry:
-; PREDFLAG-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PREDFLAG: vector.ph:
-; PREDFLAG-NEXT: br label [[VECTOR_BODY:%.*]]
-; PREDFLAG: vector.body:
-; PREDFLAG-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDFLAG-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; PREDFLAG-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP0]], i64 430)
-; PREDFLAG-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; PREDFLAG-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; PREDFLAG-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; PREDFLAG-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; PREDFLAG-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
-; PREDFLAG-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; PREDFLAG-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; PREDFLAG-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; PREDFLAG-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
-; PREDFLAG-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; PREDFLAG-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; PREDFLAG-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; PREDFLAG-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; PREDFLAG-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; PREDFLAG-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
-; PREDFLAG-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; PREDFLAG: middle.block:
-; PREDFLAG-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; PREDFLAG: scalar.ph:
-; PREDFLAG-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 432, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; PREDFLAG-NEXT: br label [[FOR_BODY:%.*]]
-; PREDFLAG: for.cond.cleanup:
-; PREDFLAG-NEXT: ret void
-; PREDFLAG: for.body:
-; PREDFLAG-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; PREDFLAG-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; PREDFLAG-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; PREDFLAG-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; PREDFLAG-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; PREDFLAG-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]]
-; PREDFLAG-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; PREDFLAG-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; PREDFLAG-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; PREDFLAG-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; PREDFLAG-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
+; CHECK-NOT: @llvm.masked.load.v8i32.p0v8i32(
+; CHECK-NOT: @llvm.masked.store.v8i32.p0v8i32(
+; CHECK: br i1 %{{.*}}, label {{.*}}, label %vector.body
+; PREDFLAG-LABEL: flag_overrules_hint(
+; PREDFLAG: vector.body:
+; PREDFLAG: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREDFLAG: %[[ELEM0:.*]] = add i64 %index, 0
+; PREDFLAG: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 430)
+; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
+; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
+; PREDFLAG: %{{.*}} = add nsw <4 x i32> %wide.masked.load1, %wide.masked.load
+; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
+; PREDFLAG: %index.next = add i64 %index, 4
+; PREDFLAG: %[[CMP:.*]] = icmp eq i64 %index.next, 432
+; PREDFLAG: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !0
entry:
br label %for.body
}
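; @interleave4 uses VF=4 with an interleave count of 4, so each vector
; iteration covers 16 elements: four lane bases (%index + 0, 4, 8, 12) each
; get their own @llvm.get.active.lane.mask call, and every load and store is
; predicated on the mask of its own group, as the PREDFLAG lines below check.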
define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
-; CHECK-LABEL: @interleave4(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 16
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 12
-; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP1]], i32 [[N]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP2]], i32 [[N]])
-; CHECK-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP3]], i32 [[N]])
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP11]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK1]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP13]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP15]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK3]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP21]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 4
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP23]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK1]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 8
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 12
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK3]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP28:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD7]], [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_MASKED_LOAD4]]
-; CHECK-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD5]]
-; CHECK-NEXT: [[TMP31:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD6]]
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 0
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP28]], <4 x i32>* [[TMP37]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 4
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP29]], <4 x i32>* [[TMP39]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK1]])
-; CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 8
-; CHECK-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP30]], <4 x i32>* [[TMP41]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]])
-; CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 12
-; CHECK-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP31]], <4 x i32>* [[TMP43]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK3]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
-; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_09]]
-; CHECK-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_09]]
-; CHECK-NEXT: [[TMP46:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP46]], [[TMP45]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_09]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PREDFLAG-LABEL: interleave4(
+; PREDFLAG: %[[ADD1:.*]] = add i32 %index, 0
+; PREDFLAG: %[[ADD2:.*]] = add i32 %index, 4
+; PREDFLAG: %[[ADD3:.*]] = add i32 %index, 8
+; PREDFLAG: %[[ADD4:.*]] = add i32 %index, 12
+; PREDFLAG: %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %N)
+; PREDFLAG: %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %N)
+; PREDFLAG: %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %N)
+; PREDFLAG: %[[ALM4:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD4]], i32 %N)
+;
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
+; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
;
-; PREDFLAG-LABEL: @interleave4(
-; PREDFLAG-NEXT: entry:
-; PREDFLAG-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; PREDFLAG-NEXT: br i1 [[CMP8]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; PREDFLAG: for.body.preheader:
-; PREDFLAG-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PREDFLAG: vector.ph:
-; PREDFLAG-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 15
-; PREDFLAG-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 16
-; PREDFLAG-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; PREDFLAG-NEXT: br label [[VECTOR_BODY:%.*]]
-; PREDFLAG: vector.body:
-; PREDFLAG-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDFLAG-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; PREDFLAG-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 4
-; PREDFLAG-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 8
-; PREDFLAG-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 12
-; PREDFLAG-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; PREDFLAG-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP1]], i32 [[N]])
-; PREDFLAG-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP2]], i32 [[N]])
-; PREDFLAG-NEXT: [[ACTIVE_LANE_MASK3:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP3]], i32 [[N]])
-; PREDFLAG-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; PREDFLAG-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP1]]
-; PREDFLAG-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP2]]
-; PREDFLAG-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP3]]
-; PREDFLAG-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; PREDFLAG-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; PREDFLAG-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP9]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; PREDFLAG-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4
-; PREDFLAG-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; PREDFLAG-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP11]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK1]], <4 x i32> poison)
-; PREDFLAG-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
-; PREDFLAG-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; PREDFLAG-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP13]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]], <4 x i32> poison)
-; PREDFLAG-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12
-; PREDFLAG-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; PREDFLAG-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP15]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK3]], <4 x i32> poison)
-; PREDFLAG-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP0]]
-; PREDFLAG-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[TMP1]]
-; PREDFLAG-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[TMP2]]
-; PREDFLAG-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[TMP3]]
-; PREDFLAG-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
-; PREDFLAG-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; PREDFLAG-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP21]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; PREDFLAG-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 4
-; PREDFLAG-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
-; PREDFLAG-NEXT: [[WIDE_MASKED_LOAD8:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP23]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK1]], <4 x i32> poison)
-; PREDFLAG-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 8
-; PREDFLAG-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; PREDFLAG-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP25]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]], <4 x i32> poison)
-; PREDFLAG-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 12
-; PREDFLAG-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; PREDFLAG-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP27]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK3]], <4 x i32> poison)
-; PREDFLAG-NEXT: [[TMP28:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD7]], [[WIDE_MASKED_LOAD]]
-; PREDFLAG-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD8]], [[WIDE_MASKED_LOAD4]]
-; PREDFLAG-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD5]]
-; PREDFLAG-NEXT: [[TMP31:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD10]], [[WIDE_MASKED_LOAD6]]
-; PREDFLAG-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; PREDFLAG-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP1]]
-; PREDFLAG-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP2]]
-; PREDFLAG-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP3]]
-; PREDFLAG-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 0
-; PREDFLAG-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <4 x i32>*
-; PREDFLAG-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP28]], <4 x i32>* [[TMP37]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
-; PREDFLAG-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 4
-; PREDFLAG-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
-; PREDFLAG-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP29]], <4 x i32>* [[TMP39]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK1]])
-; PREDFLAG-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 8
-; PREDFLAG-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <4 x i32>*
-; PREDFLAG-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP30]], <4 x i32>* [[TMP41]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK2]])
-; PREDFLAG-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 12
-; PREDFLAG-NEXT: [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <4 x i32>*
-; PREDFLAG-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP31]], <4 x i32>* [[TMP43]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK3]])
-; PREDFLAG-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 16
-; PREDFLAG-NEXT: [[TMP44:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; PREDFLAG-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; PREDFLAG: middle.block:
-; PREDFLAG-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; PREDFLAG: scalar.ph:
-; PREDFLAG-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; PREDFLAG-NEXT: br label [[FOR_BODY:%.*]]
-; PREDFLAG: for.cond.cleanup.loopexit:
-; PREDFLAG-NEXT: br label [[FOR_COND_CLEANUP]]
-; PREDFLAG: for.cond.cleanup:
-; PREDFLAG-NEXT: ret void
-; PREDFLAG: for.body:
-; PREDFLAG-NEXT: [[I_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; PREDFLAG-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[I_09]]
-; PREDFLAG-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; PREDFLAG-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C]], i32 [[I_09]]
-; PREDFLAG-NEXT: [[TMP46:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; PREDFLAG-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP46]], [[TMP45]]
-; PREDFLAG-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_09]]
-; PREDFLAG-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; PREDFLAG-NEXT: [[INC]] = add nuw nsw i32 [[I_09]], 1
-; PREDFLAG-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
-; PREDFLAG-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]])
+; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]])
+; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]])
+; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]])
;
entry:
%cmp8 = icmp sgt i32 %N, 0
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>*
-; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP5]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
+; CHECK-NEXT: store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP6]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1
; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1
-; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
-; CHECK-NEXT: store i8 [[TMP7]], i8* [[BUFF]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT: store i8 [[TMP8]], i8* [[BUFF]], align 1
; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: end:
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -vector-library=MASSV -inject-tli-mappings -loop-vectorize -force-vector-interleave=1 -mattr=-altivec -S < %s | FileCheck %s
-target datalayout = "e-m:e-i64:64-n32:64"
+target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux-gnu"
declare double @cbrt(double) #0
; Check that massv entries are not generated.
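; With Altivec disabled in the RUN line above (-mattr=-altivec), a hedged sketch of
; the mapping these CHECK-NOT lines guard against: a scalar call in the loop body
;   %call = tail call double @cbrt(double %conv)
; would otherwise be widened to the MASSV vector routine
;   %vcall = call <2 x double> @__cbrtd2(<2 x double> %vconv)
; (%vcall/%vconv are illustrative names only; the tests merely assert that no
; __cbrt*/__atanh* vector call appears in the output).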
define void @cbrt_f64(double* nocapture %varray) {
; CHECK-LABEL: @cbrt_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @cbrt(double [[CONV]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NOT: __cbrtd2{{.*}}<2 x double>
+; CHECK: ret void
;
entry:
br label %for.body
define void @cbrt_f32(float* nocapture %varray) {
; CHECK-LABEL: @cbrt_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @cbrtf(float [[CONV]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NOT: __cbrtf4{{.*}}<4 x float>
+; CHECK: ret void
;
entry:
br label %for.body
define void @atanh_f64(double* nocapture %varray) {
; CHECK-LABEL: @atanh_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @atanh(double [[CONV]]) #[[ATTR4:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NOT: __atanhd2{{.*}}<2 x double>
+; CHECK: ret void
;
entry:
br label %for.body
define void @atanh_f32(float* nocapture %varray) {
; CHECK-LABEL: @atanh_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @atanhf(float [[CONV]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NOT: __atanhf4{{.*}}<4 x float>
+; CHECK: ret void
;
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -vector-library=MASSV -mtriple=powerpc64le-unknown-linux-gnu -inject-tli-mappings -loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s
; RUN: opt -vector-library=MASSV -vec-extabi -mattr=+altivec -mtriple=powerpc64-ibm-aix-xcoff -inject-tli-mappings -loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s
declare float @atanhf(float) #0
define void @cbrt_f64(double* nocapture %varray) {
+; CHECK-LABEL: @cbrt_f64(
+; CHECK: __cbrtd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @cbrt_f32(float* nocapture %varray) {
+; CHECK-LABEL: @cbrt_f32(
+; CHECK: __cbrtf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f64(
+; CHECK: __powd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f64_intrinsic(
+; CHECK: __powd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f32(
+; CHECK: __powf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
+; CHECK-LABEL: @pow_f32_intrinsic(
+; CHECK: __powf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
define void @sqrt_f64(double* nocapture %varray) {
; CHECK-LABEL: @sqrt_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @sqrt(double [[CONV]])
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NOT: __sqrtd2{{.*}}<2 x double>
+; CHECK: ret void
;
entry:
br label %for.body
define void @sqrt_f32(float* nocapture %varray) {
; CHECK-LABEL: @sqrt_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @sqrtf(float [[CONV]])
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NOT: __sqrtf4{{.*}}<4 x float>
+; CHECK: ret void
;
entry:
br label %for.body
}
define void @exp_f64(double* nocapture %varray) {
+; CHECK-LABEL: @exp_f64(
+; CHECK: __expd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @exp_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @exp_f64_intrinsic(
+; CHECK: __expd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @exp_f32(float* nocapture %varray) {
+; CHECK-LABEL: @exp_f32(
+; CHECK: __expf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @exp_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @exp_f32_intrinsic(
+; CHECK: __expf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @exp2_f64(double* nocapture %varray) {
+; CHECK-LABEL: @exp2_f64(
+; CHECK: __exp2d2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @exp2_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @exp2_f64_intrinsic(
+; CHECK: __exp2d2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @exp2_f32(float* nocapture %varray) {
+; CHECK-LABEL: @exp2_f32(
+; CHECK: __exp2f4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @exp2_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @exp2_f32_intrinsic(
+; CHECK: __exp2f4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @expm1_f64(double* nocapture %varray) {
+; CHECK-LABEL: @expm1_f64(
+; CHECK: __expm1d2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @expm1_f32(float* nocapture %varray) {
+; CHECK-LABEL: @expm1_f32(
+; CHECK: __expm1f4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @log_f64(double* nocapture %varray) {
+; CHECK-LABEL: @log_f64(
+; CHECK: __logd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @log_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @log_f64_intrinsic(
+; CHECK: __logd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @log_f32(float* nocapture %varray) {
+; CHECK-LABEL: @log_f32(
+; CHECK: __logf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @log_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @log_f32_intrinsic(
+; CHECK: __logf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @log1p_f64(double* nocapture %varray) {
+; CHECK-LABEL: @log1p_f64(
+; CHECK: __log1pd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @log1p_f32(float* nocapture %varray) {
+; CHECK-LABEL: @log1p_f32(
+; CHECK: __log1pf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @log10_f64(double* nocapture %varray) {
+; CHECK-LABEL: @log10_f64(
+; CHECK: __log10d2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @log10_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @log10_f64_intrinsic(
+; CHECK: __log10d2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @log10_f32(float* nocapture %varray) {
+; CHECK-LABEL: @log10_f32(
+; CHECK: __log10f4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @log10_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @log10_f32_intrinsic(
+; CHECK: __log10f4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @log2_f64(double* nocapture %varray) {
+; CHECK-LABEL: @log2_f64(
+; CHECK: __log2d2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @log2_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @log2_f64_intrinsic(
+; CHECK: __log2d2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @log2_f32(float* nocapture %varray) {
+; CHECK-LABEL: @log2_f32(
+; CHECK: __log2f4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @log2_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @log2_f32_intrinsic(
+; CHECK: __log2f4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @sin_f64(double* nocapture %varray) {
+; CHECK-LABEL: @sin_f64(
+; CHECK: __sind2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @sin_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @sin_f64_intrinsic(
+; CHECK: __sind2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @sin_f32(float* nocapture %varray) {
+; CHECK-LABEL: @sin_f32(
+; CHECK: __sinf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @sin_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @sin_f32_intrinsic(
+; CHECK: __sinf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @cos_f64(double* nocapture %varray) {
+; CHECK-LABEL: @cos_f64(
+; CHECK: __cosd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @cos_f64_intrinsic(double* nocapture %varray) {
+; CHECK-LABEL: @cos_f64_intrinsic(
+; CHECK: [[TMP5:%.*]] = call <2 x double> @__cosd2(<2 x double> [[TMP4:%.*]])
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @cos_f32(float* nocapture %varray) {
+; CHECK-LABEL: @cos_f32(
+; CHECK: __cosf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @cos_f32_intrinsic(float* nocapture %varray) {
+; CHECK-LABEL: @cos_f32_intrinsic(
+; CHECK: __cosf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @tan_f64(double* nocapture %varray) {
+; CHECK-LABEL: @tan_f64(
+; CHECK: __tand2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @tan_f32(float* nocapture %varray) {
+; CHECK-LABEL: @tan_f32(
+; CHECK: __tanf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @asin_f64(double* nocapture %varray) {
+; CHECK-LABEL: @asin_f64(
+; CHECK: __asind2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @asin_f32(float* nocapture %varray) {
+; CHECK-LABEL: @asin_f32(
+; CHECK: __asinf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @acos_f64(double* nocapture %varray) {
+; CHECK-LABEL: @acos_f64(
+; CHECK: __acosd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @acos_f32(float* nocapture %varray) {
+; CHECK-LABEL: @acos_f32(
+; CHECK: __acosf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @atan_f64(double* nocapture %varray) {
+; CHECK-LABEL: @atan_f64(
+; CHECK: __atand2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @atan_f32(float* nocapture %varray) {
+; CHECK-LABEL: @atan_f32(
+; CHECK: __atanf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @atan2_f64(double* nocapture %varray) {
+; CHECK-LABEL: @atan2_f64(
+; CHECK: __atan2d2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @atan2_f32(float* nocapture %varray) {
+; CHECK-LABEL: @atan2_f32(
+; CHECK: __atan2f4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @sinh_f64(double* nocapture %varray) {
+; CHECK-LABEL: @sinh_f64(
+; CHECK: __sinhd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @sinh_f32(float* nocapture %varray) {
+; CHECK-LABEL: @sinh_f32(
+; CHECK: __sinhf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @cosh_f64(double* nocapture %varray) {
+; CHECK-LABEL: @cosh_f64(
+; CHECK: __coshd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @cosh_f32(float* nocapture %varray) {
+; CHECK-LABEL: @cosh_f32(
+; CHECK: __coshf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @tanh_f64(double* nocapture %varray) {
+; CHECK-LABEL: @tanh_f64(
+; CHECK: __tanhd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @tanh_f32(float* nocapture %varray) {
+; CHECK-LABEL: @tanh_f32(
+; CHECK: __tanhf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @asinh_f64(double* nocapture %varray) {
+; CHECK-LABEL: @asinh_f64(
+; CHECK: __asinhd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @asinh_f32(float* nocapture %varray) {
+; CHECK-LABEL: @asinh_f32(
+; CHECK: __asinhf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @acosh_f64(double* nocapture %varray) {
+; CHECK-LABEL: @acosh_f64(
+; CHECK: __acoshd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @acosh_f32(float* nocapture %varray) {
+; CHECK-LABEL: @acosh_f32(
+; CHECK: __acoshf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @atanh_f64(double* nocapture %varray) {
+; CHECK-LABEL: @atanh_f64(
+; CHECK: __atanhd2{{.*}}<2 x double>
+; CHECK: ret void
+;
entry:
br label %for.body
}
define void @atanh_f32(float* nocapture %varray) {
+; CHECK-LABEL: @atanh_f32(
+; CHECK: __atanhf4{{.*}}<4 x float>
+; CHECK: ret void
+;
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -vector-library=MASSV -inject-tli-mappings -loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s
-target datalayout = "e-m:e-i64:64-n32:64"
+target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux-gnu"
declare double @atanh(double) #1
; Check that functions marked as nobuiltin are not lowered to massv entries.
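; A minimal sketch, assuming the attribute group referenced by the declaration above
; carries nobuiltin, of the IR shape this test exercises:
;   %call = tail call double @atanh(double %conv) #1
;   attributes #1 = { nobuiltin }
; Because the call site is nobuiltin, the TLI mapping is skipped and the scalar
; libcall is kept instead of an __atanhd2 call on <2 x double>.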
define void @atanh_f64(double* nocapture %varray) {
; CHECK-LABEL: @atanh_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @atanh(double [[CONV]])
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NOT: __atanhd2{{.*}}<2 x double>
+; CHECK: ret void
;
entry:
br label %for.body
define void @atanh_f32(float* nocapture %varray) {
; CHECK-LABEL: @atanh_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @atanhf(float [[CONV]])
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NOT: __atanhf4{{.*}}<4 x float>
+; CHECK: ret void
;
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -vector-library=MASSV -inject-tli-mappings -loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s
-target datalayout = "e-m:e-i64:64-n32:64"
+target datalayout = "e-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux-gnu"
declare double @ceil(double) #0
; The vector counterpart of ceil is unsupported in the MASSV library.
define void @ceil_f64(double* nocapture %varray) {
; CHECK-LABEL: @ceil_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @ceil(double [[CONV]])
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NOT: __ceild2_massv{{.*}}<2 x double>
+; CHECK-NOT: __ceild2_P8{{.*}}<2 x double>
+; CHECK-NOT: __ceild2{{.*}}<2 x double>
+; CHECK: ret void
;
entry:
br label %for.body
; The vector counterpart of fabs is unsupported in the MASSV library.
define void @fabs_f32(float* nocapture %varray) {
; CHECK-LABEL: @fabs_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @fabsf(float [[CONV]])
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-NOT: __fabsf4_massv{{.*}}<4 x float>
+; CHECK-NOT: __fabsf4_P8{{.*}}<4 x float>
+; CHECK-NOT: __fabsf4{{.*}}<4 x float>
+; CHECK: ret void
;
entry:
br label %for.body
; The sqrt intrinsic calls below are not lowered to MASSV entries.
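; Instead, the loop vectorizer widens the intrinsic directly; sketching from the
; original autogenerated checks, a scalar call such as
;   %call = tail call double @llvm.sqrt.f64(double %conv)
; becomes
;   %vcall = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %vconv)
; (names are illustrative), which is what the llvm.sqrt.v2f64 / llvm.sqrt.v4f32
; CHECK lines below match.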
define void @sqrt_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @sqrt_f64_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <2 x i32> [[VEC_IND]] to <2 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.sqrt.f64(double [[CONV]])
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: llvm.sqrt.v2f64{{.*}}<2 x double>
+; CHECK: ret void
;
entry:
br label %for.body
define void @sqrt_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @sqrt_f32_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @llvm.sqrt.f32(float [[CONV]])
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: llvm.sqrt.v4f32{{.*}}<4 x float>
+; CHECK: ret void
;
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 -S | FileCheck %s --check-prefix VF-TWO-CHECK
; RUN: opt < %s -passes='loop-vectorize' -enable-epilogue-vectorization -epilogue-vectorization-force-VF=4 -S | FileCheck %s --check-prefix VF-FOUR-CHECK
; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP95]], <4 x float>* [[TMP131]], align 4
; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 48
; VF-TWO-CHECK-NEXT: [[TMP132:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP132]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP132]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOPID_MV:![0-9]+]]
; VF-TWO-CHECK: middle.block:
; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; VF-TWO-CHECK-NEXT: [[N_VEC26:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF25]]
; VF-TWO-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; VF-TWO-CHECK: vec.epilog.vector.body:
-; VF-TWO-CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT31:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[OFFSET_IDX]], 0
+; VF-TWO-CHECK-NEXT: [[INDEX27:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[INDEX27]], 0
; VF-TWO-CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP133]]
; VF-TWO-CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds float, float* [[TMP134]], i32 0
; VF-TWO-CHECK-NEXT: [[TMP136:%.*]] = bitcast float* [[TMP135]] to <2 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x float>, <2 x float>* [[TMP136]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x float>, <2 x float>* [[TMP136]], align 4
; VF-TWO-CHECK-NEXT: [[TMP137:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP133]]
; VF-TWO-CHECK-NEXT: [[TMP138:%.*]] = getelementptr inbounds float, float* [[TMP137]], i32 0
; VF-TWO-CHECK-NEXT: [[TMP139:%.*]] = bitcast float* [[TMP138]] to <2 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x float>, <2 x float>* [[TMP139]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <2 x float> [[WIDE_LOAD29]], [[WIDE_LOAD30]]
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x float>, <2 x float>* [[TMP139]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <2 x float> [[WIDE_LOAD30]], [[WIDE_LOAD31]]
; VF-TWO-CHECK-NEXT: [[TMP141:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP133]]
; VF-TWO-CHECK-NEXT: [[TMP142:%.*]] = getelementptr inbounds float, float* [[TMP141]], i32 0
; VF-TWO-CHECK-NEXT: [[TMP143:%.*]] = bitcast float* [[TMP142]] to <2 x float>*
; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP140]], <2 x float>* [[TMP143]], align 4
-; VF-TWO-CHECK-NEXT: [[INDEX_NEXT31]] = add nuw i64 [[OFFSET_IDX]], 2
-; VF-TWO-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC26]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF-TWO-CHECK-NEXT: [[INDEX_NEXT28]] = add nuw i64 [[INDEX27]], 2
+; VF-TWO-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC26]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOPID_EV:![0-9]+]]
; VF-TWO-CHECK: vec.epilog.middle.block:
-; VF-TWO-CHECK-NEXT: [[CMP_N27:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
-; VF-TWO-CHECK-NEXT: br i1 [[CMP_N27]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-TWO-CHECK-NEXT: [[CMP_N29:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
+; VF-TWO-CHECK-NEXT: br i1 [[CMP_N29]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; VF-TWO-CHECK: vec.epilog.scalar.ph:
; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]]
; VF-FOUR-CHECK-NEXT: [[N_VEC26:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF25]]
; VF-FOUR-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; VF-FOUR-CHECK: vec.epilog.vector.body:
-; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT31:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-FOUR-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[OFFSET_IDX]], 0
+; VF-FOUR-CHECK-NEXT: [[INDEX27:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT28:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-FOUR-CHECK-NEXT: [[TMP133:%.*]] = add i64 [[INDEX27]], 0
; VF-FOUR-CHECK-NEXT: [[TMP134:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP133]]
; VF-FOUR-CHECK-NEXT: [[TMP135:%.*]] = getelementptr inbounds float, float* [[TMP134]], i32 0
; VF-FOUR-CHECK-NEXT: [[TMP136:%.*]] = bitcast float* [[TMP135]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <4 x float>, <4 x float>* [[TMP136]], align 4
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP136]], align 4
; VF-FOUR-CHECK-NEXT: [[TMP137:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP133]]
; VF-FOUR-CHECK-NEXT: [[TMP138:%.*]] = getelementptr inbounds float, float* [[TMP137]], i32 0
; VF-FOUR-CHECK-NEXT: [[TMP139:%.*]] = bitcast float* [[TMP138]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <4 x float>, <4 x float>* [[TMP139]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <4 x float> [[WIDE_LOAD29]], [[WIDE_LOAD30]]
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <4 x float>, <4 x float>* [[TMP139]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP140:%.*]] = fadd fast <4 x float> [[WIDE_LOAD30]], [[WIDE_LOAD31]]
; VF-FOUR-CHECK-NEXT: [[TMP141:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP133]]
; VF-FOUR-CHECK-NEXT: [[TMP142:%.*]] = getelementptr inbounds float, float* [[TMP141]], i32 0
; VF-FOUR-CHECK-NEXT: [[TMP143:%.*]] = bitcast float* [[TMP142]] to <4 x float>*
; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP140]], <4 x float>* [[TMP143]], align 4
-; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT31]] = add nuw i64 [[OFFSET_IDX]], 4
-; VF-FOUR-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT31]], [[N_VEC26]]
+; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT28]] = add nuw i64 [[INDEX27]], 4
+; VF-FOUR-CHECK-NEXT: [[TMP144:%.*]] = icmp eq i64 [[INDEX_NEXT28]], [[N_VEC26]]
; VF-FOUR-CHECK-NEXT: br i1 [[TMP144]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; VF-FOUR-CHECK: vec.epilog.middle.block:
-; VF-FOUR-CHECK-NEXT: [[CMP_N27:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
-; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N27]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-FOUR-CHECK-NEXT: [[CMP_N29:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC26]]
+; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N29]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; VF-FOUR-CHECK: vec.epilog.scalar.ph:
; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
; VF-FOUR-CHECK-NEXT: br label [[FOR_BODY:%.*]]
; VF-TWO-CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP2]])
; VF-TWO-CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
; VF-TWO-CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; VF-TWO-CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
-; VF-TWO-CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[TMP0]]
-; VF-TWO-CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
-; VF-TWO-CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
-; VF-TWO-CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; VF-TWO-CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
+; VF-TWO-CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], [[TMP0]]
+; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
+; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
+; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP8]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; VF-TWO-CHECK: vector.main.loop.iter.check:
; VF-TWO-CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
; VF-TWO-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; VF-TWO-CHECK: vector.body:
; VF-TWO-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VF-TWO-CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], 0
-; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 4
-; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], 8
-; VF-TWO-CHECK-NEXT: [[TMP11:%.*]] = add i32 [[OFFSET_IDX]], 12
-; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = add i32 [[OFFSET_IDX]], 16
-; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = add i32 [[OFFSET_IDX]], 20
-; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 24
-; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 28
-; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 0
-; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 4
-; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 8
-; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 12
-; VF-TWO-CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 16
-; VF-TWO-CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 20
-; VF-TWO-CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 24
-; VF-TWO-CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 28
-; VF-TWO-CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP8]], -1
-; VF-TWO-CHECK-NEXT: [[TMP25:%.*]] = xor i32 [[TMP9]], -1
-; VF-TWO-CHECK-NEXT: [[TMP26:%.*]] = xor i32 [[TMP10]], -1
-; VF-TWO-CHECK-NEXT: [[TMP27:%.*]] = xor i32 [[TMP11]], -1
-; VF-TWO-CHECK-NEXT: [[TMP28:%.*]] = xor i32 [[TMP12]], -1
-; VF-TWO-CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP13]], -1
-; VF-TWO-CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP14]], -1
-; VF-TWO-CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP15]], -1
-; VF-TWO-CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP24]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP25]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP26]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP27]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 0
+; VF-TWO-CHECK-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 4
+; VF-TWO-CHECK-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 8
+; VF-TWO-CHECK-NEXT: [[TMP23:%.*]] = add i32 [[OFFSET_IDX]], 12
+; VF-TWO-CHECK-NEXT: [[TMP24:%.*]] = add i32 [[OFFSET_IDX]], 16
+; VF-TWO-CHECK-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 20
+; VF-TWO-CHECK-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 24
+; VF-TWO-CHECK-NEXT: [[TMP27:%.*]] = add i32 [[OFFSET_IDX]], 28
+; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 4
+; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 8
+; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 12
+; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
+; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 20
+; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 24
+; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 28
+; VF-TWO-CHECK-NEXT: [[TMP28:%.*]] = xor i32 [[TMP20]], -1
+; VF-TWO-CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP21]], -1
+; VF-TWO-CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP22]], -1
+; VF-TWO-CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP23]], -1
+; VF-TWO-CHECK-NEXT: [[TMP32:%.*]] = xor i32 [[TMP24]], -1
+; VF-TWO-CHECK-NEXT: [[TMP33:%.*]] = xor i32 [[TMP25]], -1
+; VF-TWO-CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP26]], -1
+; VF-TWO-CHECK-NEXT: [[TMP35:%.*]] = xor i32 [[TMP27]], -1
; VF-TWO-CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP28]], [[N]]
; VF-TWO-CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP29]], [[N]]
; VF-TWO-CHECK-NEXT: [[TMP38:%.*]] = add i32 [[TMP30]], [[N]]
; VF-TWO-CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP31]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP40:%.*]] = sext i32 [[TMP32]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP41:%.*]] = sext i32 [[TMP33]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP42:%.*]] = sext i32 [[TMP34]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP43:%.*]] = sext i32 [[TMP35]] to i64
+; VF-TWO-CHECK-NEXT: [[TMP40:%.*]] = add i32 [[TMP32]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP41:%.*]] = add i32 [[TMP33]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP34]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP35]], [[N]]
; VF-TWO-CHECK-NEXT: [[TMP44:%.*]] = sext i32 [[TMP36]] to i64
; VF-TWO-CHECK-NEXT: [[TMP45:%.*]] = sext i32 [[TMP37]] to i64
; VF-TWO-CHECK-NEXT: [[TMP46:%.*]] = sext i32 [[TMP38]] to i64
; VF-TWO-CHECK-NEXT: [[TMP47:%.*]] = sext i32 [[TMP39]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP40]]
-; VF-TWO-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP41]]
-; VF-TWO-CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP42]]
-; VF-TWO-CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP43]]
-; VF-TWO-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP44]]
+; VF-TWO-CHECK-NEXT: [[TMP48:%.*]] = sext i32 [[TMP40]] to i64
+; VF-TWO-CHECK-NEXT: [[TMP49:%.*]] = sext i32 [[TMP41]] to i64
+; VF-TWO-CHECK-NEXT: [[TMP50:%.*]] = sext i32 [[TMP42]] to i64
+; VF-TWO-CHECK-NEXT: [[TMP51:%.*]] = sext i32 [[TMP43]] to i64
+; VF-TWO-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP44]]
; VF-TWO-CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP45]]
; VF-TWO-CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP46]]
; VF-TWO-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP47]]
-; VF-TWO-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 0
-; VF-TWO-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP58:%.*]] = bitcast float* [[TMP57]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP58]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP48]]
+; VF-TWO-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP49]]
+; VF-TWO-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP50]]
+; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP51]]
+; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 0
+; VF-TWO-CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, float* [[TMP60]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = bitcast float* [[TMP61]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP62]], align 4
; VF-TWO-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -4
-; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP59]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP61:%.*]] = bitcast float* [[TMP60]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP61]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -4
+; VF-TWO-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP63]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP65]], align 4
; VF-TWO-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -8
-; VF-TWO-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP62]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP64:%.*]] = bitcast float* [[TMP63]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP64]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -8
+; VF-TWO-CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, float* [[TMP66]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = bitcast float* [[TMP67]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP68]], align 4
; VF-TWO-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -12
-; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP65]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP67:%.*]] = bitcast float* [[TMP66]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP67]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -12
+; VF-TWO-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, float* [[TMP69]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP71:%.*]] = bitcast float* [[TMP70]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP71]], align 4
; VF-TWO-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -16
-; VF-TWO-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP68]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP70:%.*]] = bitcast float* [[TMP69]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP70]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -16
+; VF-TWO-CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = bitcast float* [[TMP73]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP74]], align 4
; VF-TWO-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -20
-; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP71]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP73:%.*]] = bitcast float* [[TMP72]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP73]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -20
+; VF-TWO-CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, float* [[TMP75]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP77:%.*]] = bitcast float* [[TMP76]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP77]], align 4
; VF-TWO-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -24
-; VF-TWO-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP74]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP76:%.*]] = bitcast float* [[TMP75]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP76]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -24
+; VF-TWO-CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[TMP78]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = bitcast float* [[TMP79]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP80]], align 4
; VF-TWO-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -28
-; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP77]], i32 -3
-; VF-TWO-CHECK-NEXT: [[TMP79:%.*]] = bitcast float* [[TMP78]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP79]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -28
+; VF-TWO-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, float* [[TMP81]], i32 -3
+; VF-TWO-CHECK-NEXT: [[TMP83:%.*]] = bitcast float* [[TMP82]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP83]], align 4
; VF-TWO-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP81:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP82:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP83:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP16]]
-; VF-TWO-CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
-; VF-TWO-CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
-; VF-TWO-CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; VF-TWO-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP20]]
-; VF-TWO-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]]
-; VF-TWO-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP22]]
-; VF-TWO-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
-; VF-TWO-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 0
-; VF-TWO-CHECK-NEXT: [[TMP97:%.*]] = bitcast float* [[TMP96]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP80]], <4 x float>* [[TMP97]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 4
-; VF-TWO-CHECK-NEXT: [[TMP99:%.*]] = bitcast float* [[TMP98]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP81]], <4 x float>* [[TMP99]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 8
+; VF-TWO-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP88:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP89:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP90:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP91:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP12]]
+; VF-TWO-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP13]]
+; VF-TWO-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
+; VF-TWO-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
+; VF-TWO-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]]
+; VF-TWO-CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
+; VF-TWO-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
+; VF-TWO-CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
+; VF-TWO-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 0
; VF-TWO-CHECK-NEXT: [[TMP101:%.*]] = bitcast float* [[TMP100]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP82]], <4 x float>* [[TMP101]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 12
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], <4 x float>* [[TMP101]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 4
; VF-TWO-CHECK-NEXT: [[TMP103:%.*]] = bitcast float* [[TMP102]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP83]], <4 x float>* [[TMP103]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 16
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], <4 x float>* [[TMP103]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 8
; VF-TWO-CHECK-NEXT: [[TMP105:%.*]] = bitcast float* [[TMP104]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], <4 x float>* [[TMP105]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 20
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], <4 x float>* [[TMP105]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 12
; VF-TWO-CHECK-NEXT: [[TMP107:%.*]] = bitcast float* [[TMP106]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], <4 x float>* [[TMP107]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 24
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], <4 x float>* [[TMP107]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 16
; VF-TWO-CHECK-NEXT: [[TMP109:%.*]] = bitcast float* [[TMP108]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], <4 x float>* [[TMP109]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 28
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP88]], <4 x float>* [[TMP109]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 20
; VF-TWO-CHECK-NEXT: [[TMP111:%.*]] = bitcast float* [[TMP110]] to <4 x float>*
-; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], <4 x float>* [[TMP111]], align 4
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP89]], <4 x float>* [[TMP111]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 24
+; VF-TWO-CHECK-NEXT: [[TMP113:%.*]] = bitcast float* [[TMP112]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP90]], <4 x float>* [[TMP113]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 28
+; VF-TWO-CHECK-NEXT: [[TMP115:%.*]] = bitcast float* [[TMP114]] to <4 x float>*
+; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP91]], <4 x float>* [[TMP115]], align 4
; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; VF-TWO-CHECK-NEXT: [[TMP112:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP112]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF-TWO-CHECK-NEXT: [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; VF-TWO-CHECK: middle.block:
; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; VF-TWO-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; VF-TWO-CHECK: vec.epilog.iter.check:
-; VF-TWO-CHECK-NEXT: [[IND_END19:%.*]] = trunc i64 [[N_VEC]] to i32
+; VF-TWO-CHECK-NEXT: [[IND_END21:%.*]] = trunc i64 [[N_VEC]] to i32
; VF-TWO-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; VF-TWO-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
; VF-TWO-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; VF-TWO-CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC17]] to i32
; VF-TWO-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; VF-TWO-CHECK: vec.epilog.vector.body:
-; VF-TWO-CHECK-NEXT: [[OFFSET_IDX23:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[OFFSET_IDX22:%.*]] = trunc i64 [[OFFSET_IDX23]] to i32
-; VF-TWO-CHECK-NEXT: [[TMP113:%.*]] = add i32 [[OFFSET_IDX22]], 0
-; VF-TWO-CHECK-NEXT: [[TMP114:%.*]] = add i64 [[OFFSET_IDX23]], 0
-; VF-TWO-CHECK-NEXT: [[TMP115:%.*]] = xor i32 [[TMP113]], -1
-; VF-TWO-CHECK-NEXT: [[TMP116:%.*]] = add i32 [[TMP115]], [[N]]
-; VF-TWO-CHECK-NEXT: [[TMP117:%.*]] = sext i32 [[TMP116]] to i64
-; VF-TWO-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP117]]
-; VF-TWO-CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds float, float* [[TMP118]], i32 0
-; VF-TWO-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, float* [[TMP119]], i32 -1
-; VF-TWO-CHECK-NEXT: [[TMP121:%.*]] = bitcast float* [[TMP120]] to <2 x float>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <2 x float>, <2 x float>* [[TMP121]], align 4
-; VF-TWO-CHECK-NEXT: [[REVERSE25:%.*]] = shufflevector <2 x float> [[WIDE_LOAD24]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; VF-TWO-CHECK-NEXT: [[TMP122:%.*]] = fadd fast <2 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00>
-; VF-TWO-CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP114]]
-; VF-TWO-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 0
+; VF-TWO-CHECK-NEXT: [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT: [[OFFSET_IDX23:%.*]] = trunc i64 [[INDEX18]] to i32
+; VF-TWO-CHECK-NEXT: [[TMP118:%.*]] = add i32 [[OFFSET_IDX23]], 0
+; VF-TWO-CHECK-NEXT: [[TMP117:%.*]] = add i64 [[INDEX18]], 0
+; VF-TWO-CHECK-NEXT: [[TMP119:%.*]] = xor i32 [[TMP118]], -1
+; VF-TWO-CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP119]], [[N]]
+; VF-TWO-CHECK-NEXT: [[TMP121:%.*]] = sext i32 [[TMP120]] to i64
+; VF-TWO-CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP121]]
+; VF-TWO-CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds float, float* [[TMP122]], i32 0
+; VF-TWO-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 -1
; VF-TWO-CHECK-NEXT: [[TMP125:%.*]] = bitcast float* [[TMP124]] to <2 x float>*
-; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP122]], <2 x float>* [[TMP125]], align 4
-; VF-TWO-CHECK-NEXT: [[INDEX_NEXT26]] = add nuw i64 [[OFFSET_IDX23]], 2
-; VF-TWO-CHECK-NEXT: [[TMP126:%.*]] = icmp eq i64 [[INDEX_NEXT26]], [[N_VEC17]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP126]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <2 x float>, <2 x float>* [[TMP125]], align 4
+; VF-TWO-CHECK-NEXT: [[REVERSE25:%.*]] = shufflevector <2 x float> [[WIDE_LOAD24]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
+; VF-TWO-CHECK-NEXT: [[TMP126:%.*]] = fadd fast <2 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00>
+; VF-TWO-CHECK-NEXT: [[TMP127:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP117]]
+; VF-TWO-CHECK-NEXT: [[TMP128:%.*]] = getelementptr inbounds float, float* [[TMP127]], i32 0
+; VF-TWO-CHECK-NEXT: [[TMP129:%.*]] = bitcast float* [[TMP128]] to <2 x float>*
+; VF-TWO-CHECK-NEXT: store <2 x float> [[TMP126]], <2 x float>* [[TMP129]], align 4
+; VF-TWO-CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX18]], 2
+; VF-TWO-CHECK-NEXT: [[TMP130:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC17]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP130]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; VF-TWO-CHECK: vec.epilog.middle.block:
-; VF-TWO-CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]]
-; VF-TWO-CHECK-NEXT: br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-TWO-CHECK-NEXT: [[CMP_N22:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]]
+; VF-TWO-CHECK-NEXT: br i1 [[CMP_N22]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; VF-TWO-CHECK: vec.epilog.scalar.ph:
; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
-; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL18:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL20:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]]
; VF-TWO-CHECK: for.body:
; VF-TWO-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL18]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[TMP127:%.*]] = xor i32 [[I_014]], -1
-; VF-TWO-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP127]], [[N]]
+; VF-TWO-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL20]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; VF-TWO-CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[I_014]], -1
+; VF-TWO-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP131]], [[N]]
; VF-TWO-CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64
; VF-TWO-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]]
-; VF-TWO-CHECK-NEXT: [[TMP128:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; VF-TWO-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP128]], 1.000000e+00
+; VF-TWO-CHECK-NEXT: [[TMP132:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; VF-TWO-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP132]], 1.000000e+00
; VF-TWO-CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; VF-TWO-CHECK-NEXT: store float [[CONV3]], float* [[ARRAYIDX5]], align 4
; VF-TWO-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; VF-FOUR-CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP2]])
; VF-FOUR-CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
; VF-FOUR-CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; VF-FOUR-CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
-; VF-FOUR-CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[TMP0]]
-; VF-FOUR-CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
-; VF-FOUR-CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
-; VF-FOUR-CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; VF-FOUR-CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; VF-FOUR-CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
+; VF-FOUR-CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], [[TMP0]]
+; VF-FOUR-CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
+; VF-FOUR-CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
+; VF-FOUR-CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP8]]
+; VF-FOUR-CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; VF-FOUR-CHECK: vector.main.loop.iter.check:
; VF-FOUR-CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 32
; VF-FOUR-CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; VF-FOUR-CHECK: vector.body:
; VF-FOUR-CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; VF-FOUR-CHECK-NEXT: [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], 0
-; VF-FOUR-CHECK-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], 4
-; VF-FOUR-CHECK-NEXT: [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], 8
-; VF-FOUR-CHECK-NEXT: [[TMP11:%.*]] = add i32 [[OFFSET_IDX]], 12
-; VF-FOUR-CHECK-NEXT: [[TMP12:%.*]] = add i32 [[OFFSET_IDX]], 16
-; VF-FOUR-CHECK-NEXT: [[TMP13:%.*]] = add i32 [[OFFSET_IDX]], 20
-; VF-FOUR-CHECK-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 24
-; VF-FOUR-CHECK-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 28
-; VF-FOUR-CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 0
-; VF-FOUR-CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 4
-; VF-FOUR-CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 8
-; VF-FOUR-CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 12
-; VF-FOUR-CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX]], 16
-; VF-FOUR-CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 20
-; VF-FOUR-CHECK-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], 24
-; VF-FOUR-CHECK-NEXT: [[TMP23:%.*]] = add i64 [[INDEX]], 28
-; VF-FOUR-CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP8]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP25:%.*]] = xor i32 [[TMP9]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP26:%.*]] = xor i32 [[TMP10]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP27:%.*]] = xor i32 [[TMP11]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP28:%.*]] = xor i32 [[TMP12]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP13]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP14]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP15]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP32:%.*]] = add i32 [[TMP24]], [[N]]
-; VF-FOUR-CHECK-NEXT: [[TMP33:%.*]] = add i32 [[TMP25]], [[N]]
-; VF-FOUR-CHECK-NEXT: [[TMP34:%.*]] = add i32 [[TMP26]], [[N]]
-; VF-FOUR-CHECK-NEXT: [[TMP35:%.*]] = add i32 [[TMP27]], [[N]]
+; VF-FOUR-CHECK-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 0
+; VF-FOUR-CHECK-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 4
+; VF-FOUR-CHECK-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX]], 8
+; VF-FOUR-CHECK-NEXT: [[TMP23:%.*]] = add i32 [[OFFSET_IDX]], 12
+; VF-FOUR-CHECK-NEXT: [[TMP24:%.*]] = add i32 [[OFFSET_IDX]], 16
+; VF-FOUR-CHECK-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], 20
+; VF-FOUR-CHECK-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX]], 24
+; VF-FOUR-CHECK-NEXT: [[TMP27:%.*]] = add i32 [[OFFSET_IDX]], 28
+; VF-FOUR-CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0
+; VF-FOUR-CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 4
+; VF-FOUR-CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 8
+; VF-FOUR-CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 12
+; VF-FOUR-CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 16
+; VF-FOUR-CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 20
+; VF-FOUR-CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 24
+; VF-FOUR-CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 28
+; VF-FOUR-CHECK-NEXT: [[TMP28:%.*]] = xor i32 [[TMP20]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP29:%.*]] = xor i32 [[TMP21]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP30:%.*]] = xor i32 [[TMP22]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP31:%.*]] = xor i32 [[TMP23]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP32:%.*]] = xor i32 [[TMP24]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP33:%.*]] = xor i32 [[TMP25]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP34:%.*]] = xor i32 [[TMP26]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP35:%.*]] = xor i32 [[TMP27]], -1
; VF-FOUR-CHECK-NEXT: [[TMP36:%.*]] = add i32 [[TMP28]], [[N]]
; VF-FOUR-CHECK-NEXT: [[TMP37:%.*]] = add i32 [[TMP29]], [[N]]
; VF-FOUR-CHECK-NEXT: [[TMP38:%.*]] = add i32 [[TMP30]], [[N]]
; VF-FOUR-CHECK-NEXT: [[TMP39:%.*]] = add i32 [[TMP31]], [[N]]
-; VF-FOUR-CHECK-NEXT: [[TMP40:%.*]] = sext i32 [[TMP32]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP41:%.*]] = sext i32 [[TMP33]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP42:%.*]] = sext i32 [[TMP34]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP43:%.*]] = sext i32 [[TMP35]] to i64
+; VF-FOUR-CHECK-NEXT: [[TMP40:%.*]] = add i32 [[TMP32]], [[N]]
+; VF-FOUR-CHECK-NEXT: [[TMP41:%.*]] = add i32 [[TMP33]], [[N]]
+; VF-FOUR-CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP34]], [[N]]
+; VF-FOUR-CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP35]], [[N]]
; VF-FOUR-CHECK-NEXT: [[TMP44:%.*]] = sext i32 [[TMP36]] to i64
; VF-FOUR-CHECK-NEXT: [[TMP45:%.*]] = sext i32 [[TMP37]] to i64
; VF-FOUR-CHECK-NEXT: [[TMP46:%.*]] = sext i32 [[TMP38]] to i64
; VF-FOUR-CHECK-NEXT: [[TMP47:%.*]] = sext i32 [[TMP39]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP40]]
-; VF-FOUR-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP41]]
-; VF-FOUR-CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP42]]
-; VF-FOUR-CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP43]]
-; VF-FOUR-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP44]]
+; VF-FOUR-CHECK-NEXT: [[TMP48:%.*]] = sext i32 [[TMP40]] to i64
+; VF-FOUR-CHECK-NEXT: [[TMP49:%.*]] = sext i32 [[TMP41]] to i64
+; VF-FOUR-CHECK-NEXT: [[TMP50:%.*]] = sext i32 [[TMP42]] to i64
+; VF-FOUR-CHECK-NEXT: [[TMP51:%.*]] = sext i32 [[TMP43]] to i64
+; VF-FOUR-CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP44]]
; VF-FOUR-CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP45]]
; VF-FOUR-CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP46]]
; VF-FOUR-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP47]]
-; VF-FOUR-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 0
-; VF-FOUR-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[TMP56]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP58:%.*]] = bitcast float* [[TMP57]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP58]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP48]]
+; VF-FOUR-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP49]]
+; VF-FOUR-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP50]]
+; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP51]]
+; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 0
+; VF-FOUR-CHECK-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, float* [[TMP60]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = bitcast float* [[TMP61]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP62]], align 4
; VF-FOUR-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -4
-; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, float* [[TMP59]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP61:%.*]] = bitcast float* [[TMP60]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP61]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -4
+; VF-FOUR-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[TMP63]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP65]], align 4
; VF-FOUR-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -8
-; VF-FOUR-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, float* [[TMP62]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP64:%.*]] = bitcast float* [[TMP63]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP64]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -8
+; VF-FOUR-CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds float, float* [[TMP66]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = bitcast float* [[TMP67]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP68]], align 4
; VF-FOUR-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -12
-; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, float* [[TMP65]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP67:%.*]] = bitcast float* [[TMP66]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP67]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -12
+; VF-FOUR-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, float* [[TMP69]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP71:%.*]] = bitcast float* [[TMP70]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>* [[TMP71]], align 4
; VF-FOUR-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -16
-; VF-FOUR-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[TMP68]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP70:%.*]] = bitcast float* [[TMP69]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP70]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -16
+; VF-FOUR-CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds float, float* [[TMP72]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = bitcast float* [[TMP73]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP74]], align 4
; VF-FOUR-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -20
-; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, float* [[TMP71]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP73:%.*]] = bitcast float* [[TMP72]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP73]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -20
+; VF-FOUR-CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, float* [[TMP75]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP77:%.*]] = bitcast float* [[TMP76]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP77]], align 4
; VF-FOUR-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -24
-; VF-FOUR-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, float* [[TMP74]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP76:%.*]] = bitcast float* [[TMP75]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP76]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -24
+; VF-FOUR-CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[TMP78]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = bitcast float* [[TMP79]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP80]], align 4
; VF-FOUR-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, float* [[TMP48]], i32 -28
-; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, float* [[TMP77]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP79:%.*]] = bitcast float* [[TMP78]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP79]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds float, float* [[TMP52]], i32 -28
+; VF-FOUR-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, float* [[TMP81]], i32 -3
+; VF-FOUR-CHECK-NEXT: [[TMP83:%.*]] = bitcast float* [[TMP82]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, <4 x float>* [[TMP83]], align 4
; VF-FOUR-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP81:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP82:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP83:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP16]]
-; VF-FOUR-CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
-; VF-FOUR-CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
-; VF-FOUR-CHECK-NEXT: [[TMP91:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
-; VF-FOUR-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP20]]
-; VF-FOUR-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP21]]
-; VF-FOUR-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP22]]
-; VF-FOUR-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
-; VF-FOUR-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 0
-; VF-FOUR-CHECK-NEXT: [[TMP97:%.*]] = bitcast float* [[TMP96]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP80]], <4 x float>* [[TMP97]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 4
-; VF-FOUR-CHECK-NEXT: [[TMP99:%.*]] = bitcast float* [[TMP98]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP81]], <4 x float>* [[TMP99]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 8
+; VF-FOUR-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[REVERSE3]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP86:%.*]] = fadd fast <4 x float> [[REVERSE5]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP87:%.*]] = fadd fast <4 x float> [[REVERSE7]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP88:%.*]] = fadd fast <4 x float> [[REVERSE9]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP89:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP90:%.*]] = fadd fast <4 x float> [[REVERSE13]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP91:%.*]] = fadd fast <4 x float> [[REVERSE15]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP92:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP12]]
+; VF-FOUR-CHECK-NEXT: [[TMP93:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP13]]
+; VF-FOUR-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP14]]
+; VF-FOUR-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
+; VF-FOUR-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]]
+; VF-FOUR-CHECK-NEXT: [[TMP97:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP17]]
+; VF-FOUR-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP18]]
+; VF-FOUR-CHECK-NEXT: [[TMP99:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP19]]
+; VF-FOUR-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 0
; VF-FOUR-CHECK-NEXT: [[TMP101:%.*]] = bitcast float* [[TMP100]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP82]], <4 x float>* [[TMP101]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 12
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], <4 x float>* [[TMP101]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 4
; VF-FOUR-CHECK-NEXT: [[TMP103:%.*]] = bitcast float* [[TMP102]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP83]], <4 x float>* [[TMP103]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 16
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], <4 x float>* [[TMP103]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 8
; VF-FOUR-CHECK-NEXT: [[TMP105:%.*]] = bitcast float* [[TMP104]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], <4 x float>* [[TMP105]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 20
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], <4 x float>* [[TMP105]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 12
; VF-FOUR-CHECK-NEXT: [[TMP107:%.*]] = bitcast float* [[TMP106]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], <4 x float>* [[TMP107]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 24
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP87]], <4 x float>* [[TMP107]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 16
; VF-FOUR-CHECK-NEXT: [[TMP109:%.*]] = bitcast float* [[TMP108]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], <4 x float>* [[TMP109]], align 4
-; VF-FOUR-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP88]], i32 28
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP88]], <4 x float>* [[TMP109]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 20
; VF-FOUR-CHECK-NEXT: [[TMP111:%.*]] = bitcast float* [[TMP110]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP87]], <4 x float>* [[TMP111]], align 4
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP89]], <4 x float>* [[TMP111]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 24
+; VF-FOUR-CHECK-NEXT: [[TMP113:%.*]] = bitcast float* [[TMP112]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP90]], <4 x float>* [[TMP113]], align 4
+; VF-FOUR-CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, float* [[TMP92]], i32 28
+; VF-FOUR-CHECK-NEXT: [[TMP115:%.*]] = bitcast float* [[TMP114]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP91]], <4 x float>* [[TMP115]], align 4
; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; VF-FOUR-CHECK-NEXT: [[TMP112:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF-FOUR-CHECK-NEXT: br i1 [[TMP112]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VF-FOUR-CHECK-NEXT: [[TMP116:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VF-FOUR-CHECK-NEXT: br i1 [[TMP116]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOPID_MV_CM:![0-9]+]]
; VF-FOUR-CHECK: middle.block:
; VF-FOUR-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; VF-FOUR-CHECK: vec.epilog.iter.check:
-; VF-FOUR-CHECK-NEXT: [[IND_END19:%.*]] = trunc i64 [[N_VEC]] to i32
+; VF-FOUR-CHECK-NEXT: [[IND_END21:%.*]] = trunc i64 [[N_VEC]] to i32
; VF-FOUR-CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; VF-FOUR-CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
; VF-FOUR-CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; VF-FOUR-CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC17]] to i32
; VF-FOUR-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; VF-FOUR-CHECK: vec.epilog.vector.body:
-; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX23:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT26:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX22:%.*]] = trunc i64 [[OFFSET_IDX23]] to i32
-; VF-FOUR-CHECK-NEXT: [[TMP113:%.*]] = add i32 [[OFFSET_IDX22]], 0
-; VF-FOUR-CHECK-NEXT: [[TMP114:%.*]] = add i64 [[OFFSET_IDX23]], 0
-; VF-FOUR-CHECK-NEXT: [[TMP115:%.*]] = xor i32 [[TMP113]], -1
-; VF-FOUR-CHECK-NEXT: [[TMP116:%.*]] = add i32 [[TMP115]], [[N]]
-; VF-FOUR-CHECK-NEXT: [[TMP117:%.*]] = sext i32 [[TMP116]] to i64
-; VF-FOUR-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP117]]
-; VF-FOUR-CHECK-NEXT: [[TMP119:%.*]] = getelementptr inbounds float, float* [[TMP118]], i32 0
-; VF-FOUR-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, float* [[TMP119]], i32 -3
-; VF-FOUR-CHECK-NEXT: [[TMP121:%.*]] = bitcast float* [[TMP120]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, <4 x float>* [[TMP121]], align 4
-; VF-FOUR-CHECK-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x float> [[WIDE_LOAD24]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; VF-FOUR-CHECK-NEXT: [[TMP122:%.*]] = fadd fast <4 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; VF-FOUR-CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP114]]
-; VF-FOUR-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 0
+; VF-FOUR-CHECK-NEXT: [[INDEX18:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-FOUR-CHECK-NEXT: [[OFFSET_IDX23:%.*]] = trunc i64 [[INDEX18]] to i32
+; VF-FOUR-CHECK-NEXT: [[TMP118:%.*]] = add i32 [[OFFSET_IDX23]], 0
+; VF-FOUR-CHECK-NEXT: [[TMP117:%.*]] = add i64 [[INDEX18]], 0
+; VF-FOUR-CHECK-NEXT: [[TMP119:%.*]] = xor i32 [[TMP118]], -1
+; VF-FOUR-CHECK-NEXT: [[TMP120:%.*]] = add i32 [[TMP119]], [[N]]
+; VF-FOUR-CHECK-NEXT: [[TMP121:%.*]] = sext i32 [[TMP120]] to i64
+; VF-FOUR-CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP121]]
+; VF-FOUR-CHECK-NEXT: [[TMP123:%.*]] = getelementptr inbounds float, float* [[TMP122]], i32 0
+; VF-FOUR-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, float* [[TMP123]], i32 -3
; VF-FOUR-CHECK-NEXT: [[TMP125:%.*]] = bitcast float* [[TMP124]] to <4 x float>*
-; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP122]], <4 x float>* [[TMP125]], align 4
-; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT26]] = add nuw i64 [[OFFSET_IDX23]], 4
-; VF-FOUR-CHECK-NEXT: [[TMP126:%.*]] = icmp eq i64 [[INDEX_NEXT26]], [[N_VEC17]]
-; VF-FOUR-CHECK-NEXT: br i1 [[TMP126]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, <4 x float>* [[TMP125]], align 4
+; VF-FOUR-CHECK-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x float> [[WIDE_LOAD24]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; VF-FOUR-CHECK-NEXT: [[TMP126:%.*]] = fadd fast <4 x float> [[REVERSE25]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; VF-FOUR-CHECK-NEXT: [[TMP127:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP117]]
+; VF-FOUR-CHECK-NEXT: [[TMP128:%.*]] = getelementptr inbounds float, float* [[TMP127]], i32 0
+; VF-FOUR-CHECK-NEXT: [[TMP129:%.*]] = bitcast float* [[TMP128]] to <4 x float>*
+; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP126]], <4 x float>* [[TMP129]], align 4
+; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX18]], 4
+; VF-FOUR-CHECK-NEXT: [[TMP130:%.*]] = icmp eq i64 [[INDEX_NEXT19]], [[N_VEC17]]
+; VF-FOUR-CHECK-NEXT: br i1 [[TMP130]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOPID_EV_CM:![0-9]+]]
; VF-FOUR-CHECK: vec.epilog.middle.block:
-; VF-FOUR-CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]]
-; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N20]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-FOUR-CHECK-NEXT: [[CMP_N22:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC17]]
+; VF-FOUR-CHECK-NEXT: br i1 [[CMP_N22]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; VF-FOUR-CHECK: vec.epilog.scalar.ph:
; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
-; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL18:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END19]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; VF-FOUR-CHECK-NEXT: [[BC_RESUME_VAL20:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
; VF-FOUR-CHECK-NEXT: br label [[FOR_BODY:%.*]]
; VF-FOUR-CHECK: for.body:
; VF-FOUR-CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; VF-FOUR-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL18]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; VF-FOUR-CHECK-NEXT: [[TMP127:%.*]] = xor i32 [[I_014]], -1
-; VF-FOUR-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP127]], [[N]]
+; VF-FOUR-CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL20]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; VF-FOUR-CHECK-NEXT: [[TMP131:%.*]] = xor i32 [[I_014]], -1
+; VF-FOUR-CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP131]], [[N]]
; VF-FOUR-CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64
; VF-FOUR-CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]]
-; VF-FOUR-CHECK-NEXT: [[TMP128:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; VF-FOUR-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP128]], 1.000000e+00
+; VF-FOUR-CHECK-NEXT: [[TMP132:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; VF-FOUR-CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP132]], 1.000000e+00
; VF-FOUR-CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; VF-FOUR-CHECK-NEXT: store float [[CONV3]], float* [[ARRAYIDX5]], align 4
; VF-FOUR-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; VF-FOUR-CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_014]], 1
; VF-FOUR-CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; VF-FOUR-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; VF-FOUR-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOPID_MS_CM:![0-9]+]]
; VF-FOUR-CHECK: for.end.loopexit.loopexit:
; VF-FOUR-CHECK-NEXT: br label [[FOR_END_LOOPEXIT]]
; VF-FOUR-CHECK: for.end.loopexit:
ret i32 0
}
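; A rough C equivalent of the loop the VF-TWO/VF-FOUR checks above cover,
; inferred from the IR (function and variable names here are illustrative,
; not taken from the test):
;
;   void f(float *A, float *B, int N) {
;     for (int i = 0; i < N; i++)
;       A[i] = B[N - i - 1] + 1.0f;   /* reversed read of B, forward write to A */
;   }
;
; The main vector loop consumes 32 elements per iteration (eight reversed
; <4 x float> loads feeding eight contiguous <4 x float> stores), and
; vec.epilog.vector.body handles the remainder with <2 x float> (VF-TWO) or
; <4 x float> (VF-FOUR) before falling back to the scalar for.body.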
+; VF-TWO-CHECK-DAG: [[LOOPID_MV]] = distinct !{[[LOOPID_MV]], [[LOOPID_DISABLE_VECT:!.*]]}
+; VF-TWO-CHECK-DAG: [[LOOPID_EV]] = distinct !{[[LOOPID_EV]], [[LOOPID_DISABLE_VECT]], [[LOOPID_DISABLE_UNROLL:!.*]]}
+; VF-TWO-CHECK-DAG: [[LOOPID_DISABLE_VECT]] = [[DISABLE_VECT_STR:!{!"llvm.loop.isvectorized".*}.*]]
+; VF-TWO-CHECK-DAG: [[LOOPID_DISABLE_UNROLL]] = [[DISABLE_UNROLL_STR:!{!"llvm.loop.unroll.runtime.disable"}.*]]
;
+; VF-FOUR-CHECK-DAG: [[LOOPID_MV_CM]] = distinct !{[[LOOPID_MV_CM]], [[LOOPID_DISABLE_VECT_CM:!.*]]}
+; VF-FOUR-CHECK-DAG: [[LOOPID_EV_CM]] = distinct !{[[LOOPID_EV_CM]], [[LOOPID_DISABLE_VECT_CM]], [[LOOPID_DISABLE_UNROLL_CM:!.*]]}
+; VF-FOUR-CHECK-DAG: [[LOOPID_DISABLE_VECT_CM]] = [[DISABLE_VECT_STR_CM:!{!"llvm.loop.isvectorized".*}.*]]
+; VF-FOUR-CHECK-DAG: [[LOOPID_DISABLE_UNROLL_CM]] = [[DISABLE_UNROLL_STR_CM:!{!"llvm.loop.unroll.runtime.disable"}.*]]
;
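; The *-CHECK-DAG lines above only constrain the shape of the loop metadata on
; the vector-loop back branches: the main vector loop's ID must reference an
; "llvm.loop.isvectorized" node, and the epilogue vector loop's ID must
; reference both that node and an "llvm.loop.unroll.runtime.disable" node.
; A plausible instance of what the regexes match (the metadata numbering is
; illustrative only):
;
;   !5 = distinct !{!5, !6}
;   !6 = !{!"llvm.loop.isvectorized", i32 1}
;   !7 = distinct !{!7, !6, !8}
;   !8 = !{!"llvm.loop.unroll.runtime.disable"}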
attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+vsx,-power9-vector,-spe" "unsafe-fp-math"="true" "use-soft-float"="false" }
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mcpu=pwr8 -mattr=+vsx -force-vector-interleave=1 -vectorizer-maximize-bandwidth=0 -S | FileCheck %s
target triple = "powerpc64-unknown-linux-gnu"
define signext i32 @foo(i8* readonly %ptr, i32 signext %l) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[PTR1:%.*]] = ptrtoint i8* [[PTR:%.*]] to i64
-; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[L:%.*]] to i64
-; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[IDX_EXT]]
-; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[L]], 0
-; CHECK-NEXT: br i1 [[CMP7]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
-; CHECK: while.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[PTR1]], [[IDX_EXT]]
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[PTR1]], 1
-; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 [[TMP1]])
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[UMAX]], [[PTR1]]
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[PTR]], i64 [[N_VEC]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i8> [[WIDE_LOAD]], <i8 -64, i8 -64, i8 -64, i8 -64>
-; CHECK-NEXT: [[TMP7:%.*]] = zext <4 x i1> [[TMP6]] to <4 x i32>
-; CHECK-NEXT: [[TMP8]] = add <4 x i32> [[TMP7]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[WHILE_BODY_PREHEADER]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[COUNT_09:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[PTR_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[PTR_ADDR_08]], align 1
-; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i8 [[TMP11]], -64
-; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP1]] to i32
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[COND]], [[COUNT_09]]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PTR_ADDR_08]], i64 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8* [[INCDEC_PTR]], [[ADD_PTR]]
-; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_END]]
-; CHECK: while.end:
-; CHECK-NEXT: [[COUNT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT: ret i32 [[COUNT_0_LCSSA]]
-;
entry:
%idx.ext = sext i32 %l to i64
%add.ptr = getelementptr inbounds i8, i8* %ptr, i64 %idx.ext
%count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
ret i32 %count.0.lcssa
+; CHECK-LABEL: foo
+; CHECK: load <4 x i8>
+; CHECK: icmp slt <4 x i8>
}
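; @foo above reduces to counting how many of the first %l bytes at %ptr are
; below -64; a rough C equivalent (inferred from the IR, identifiers are
; illustrative):
;
;   int foo(const signed char *ptr, int l) {
;     int count = 0;
;     for (const signed char *p = ptr; p < ptr + l; p++)
;       count += (*p < -64);
;     return count;
;   }
;
; The CHECK lines only require that the reduction is vectorized with <4 x i8>
; loads and compares; @foo2 below is the same loop with an i16 accumulator and
; is expected to use <8 x i8>.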
define signext i16 @foo2(i8* readonly %ptr, i32 signext %l) {
-; CHECK-LABEL: @foo2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[PTR1:%.*]] = ptrtoint i8* [[PTR:%.*]] to i64
-; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[L:%.*]] to i64
-; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i64 [[IDX_EXT]]
-; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[L]], 0
-; CHECK-NEXT: br i1 [[CMP7]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
-; CHECK: while.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[PTR1]], [[IDX_EXT]]
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[PTR1]], 1
-; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 [[TMP1]])
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[UMAX]], [[PTR1]]
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[PTR]], i64 [[N_VEC]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <8 x i8> [[WIDE_LOAD]], <i8 -64, i8 -64, i8 -64, i8 -64, i8 -64, i8 -64, i8 -64, i8 -64>
-; CHECK-NEXT: [[TMP7:%.*]] = zext <8 x i1> [[TMP6]] to <8 x i16>
-; CHECK-NEXT: [[TMP8]] = add <8 x i16> [[TMP7]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP10:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP8]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i16 [ 0, [[WHILE_BODY_PREHEADER]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[COUNT_09:%.*]] = phi i16 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[PTR_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = load i8, i8* [[PTR_ADDR_08]], align 1
-; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i8 [[TMP11]], -64
-; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP1]] to i16
-; CHECK-NEXT: [[ADD]] = add nsw i16 [[COND]], [[COUNT_09]]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PTR_ADDR_08]], i64 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8* [[INCDEC_PTR]], [[ADD_PTR]]
-; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i16 [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_END]]
-; CHECK: while.end:
-; CHECK-NEXT: [[COUNT_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT: ret i16 [[COUNT_0_LCSSA]]
-;
entry:
- %idx.ext = sext i32 %l to i64
+ %idx.ext = sext i32 %l to i64
%add.ptr = getelementptr inbounds i8, i8* %ptr, i64 %idx.ext
%cmp7 = icmp sgt i32 %l, 0
br i1 %cmp7, label %while.body.preheader, label %while.end
%count.09 = phi i16 [ %add, %while.body ], [ 0, %while.body.preheader ]
%ptr.addr.08 = phi i8* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
%0 = load i8, i8* %ptr.addr.08, align 1
- %cmp1 = icmp slt i8 %0, -64
- %cond = zext i1 %cmp1 to i16
+ %cmp1 = icmp slt i8 %0, -64
+ %cond = zext i1 %cmp1 to i16
%add = add nsw i16 %cond, %count.09
%incdec.ptr = getelementptr inbounds i8, i8* %ptr.addr.08, i64 1
%cmp = icmp ult i8* %incdec.ptr, %add.ptr
%count.0.lcssa = phi i16 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
ret i16 %count.0.lcssa
+; CHECK-LABEL: foo2
+; CHECK: load <8 x i8>
+; CHECK: icmp slt <8 x i8>
}
define signext i32 @foo3(i16* readonly %ptr, i32 signext %l) {
-; CHECK-LABEL: @foo3(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[PTR1:%.*]] = ptrtoint i16* [[PTR:%.*]] to i64
-; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[L:%.*]] to i64
-; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 [[IDX_EXT]]
-; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[L]], 0
-; CHECK-NEXT: br i1 [[CMP7]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
-; CHECK: while.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[PTR1]], [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[PTR1]], 2
-; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 [[TMP2]])
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[UMAX]], -1
-; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], [[PTR1]]
-; CHECK-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP4]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = add nuw i64 [[TMP5]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP6]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP6]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i16, i16* [[PTR]], i64 [[N_VEC]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PTR]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP8]] to <4 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP9]], align 1
-; CHECK-NEXT: [[TMP10:%.*]] = icmp slt <4 x i16> [[WIDE_LOAD]], <i16 -64, i16 -64, i16 -64, i16 -64>
-; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i1> [[TMP10]] to <4 x i32>
-; CHECK-NEXT: [[TMP12]] = add <4 x i32> [[TMP11]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP12]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[WHILE_BODY_PREHEADER]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[COUNT_09:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[PTR_ADDR_16:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[PTR_ADDR_16]], align 1
-; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i16 [[TMP15]], -64
-; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP1]] to i32
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[COND]], [[COUNT_09]]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PTR_ADDR_16]], i64 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16* [[INCDEC_PTR]], [[ADD_PTR]]
-; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_END]]
-; CHECK: while.end:
-; CHECK-NEXT: [[COUNT_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT: ret i32 [[COUNT_0_LCSSA]]
-;
entry:
- %idx.ext = sext i32 %l to i64
+ %idx.ext = sext i32 %l to i64
%add.ptr = getelementptr inbounds i16, i16* %ptr, i64 %idx.ext
%cmp7 = icmp sgt i32 %l, 0
br i1 %cmp7, label %while.body.preheader, label %while.end
%count.09 = phi i32 [ %add, %while.body ], [ 0, %while.body.preheader ]
%ptr.addr.16 = phi i16* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
%0 = load i16, i16* %ptr.addr.16, align 1
- %cmp1 = icmp slt i16 %0, -64
- %cond = zext i1 %cmp1 to i32
+ %cmp1 = icmp slt i16 %0, -64
+ %cond = zext i1 %cmp1 to i32
%add = add nsw i32 %cond, %count.09
%incdec.ptr = getelementptr inbounds i16, i16* %ptr.addr.16, i64 1
%cmp = icmp ult i16* %incdec.ptr, %add.ptr
%count.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
ret i32 %count.0.lcssa
+; CHECK-LABEL: foo3
+; CHECK: load <4 x i16>
+; CHECK: icmp slt <4 x i16>
}
define i64 @foo4(i16* readonly %ptr, i32 signext %l) {
-; CHECK-LABEL: @foo4(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[PTR1:%.*]] = ptrtoint i16* [[PTR:%.*]] to i64
-; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[L:%.*]] to i64
-; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 [[IDX_EXT]]
-; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[L]], 0
-; CHECK-NEXT: br i1 [[CMP7]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
-; CHECK: while.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = shl nsw i64 [[IDX_EXT]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[PTR1]], [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[PTR1]], 2
-; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 [[TMP2]])
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[UMAX]], -1
-; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], [[PTR1]]
-; CHECK-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP4]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = add nuw i64 [[TMP5]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP6]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP6]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i16, i16* [[PTR]], i64 [[N_VEC]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[PTR]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, i16* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i16* [[TMP8]] to <2 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP9]], align 1
-; CHECK-NEXT: [[TMP10:%.*]] = icmp slt <2 x i16> [[WIDE_LOAD]], <i16 -64, i16 -64>
-; CHECK-NEXT: [[TMP11:%.*]] = zext <2 x i1> [[TMP10]] to <2 x i64>
-; CHECK-NEXT: [[TMP12]] = add <2 x i64> [[TMP11]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[TMP12]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[WHILE_BODY_PREHEADER]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[COUNT_09:%.*]] = phi i64 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[PTR_ADDR_16:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[PTR_ADDR_16]], align 1
-; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i16 [[TMP15]], -64
-; CHECK-NEXT: [[COND:%.*]] = zext i1 [[CMP1]] to i64
-; CHECK-NEXT: [[ADD]] = add nsw i64 [[COND]], [[COUNT_09]]
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PTR_ADDR_16]], i64 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16* [[INCDEC_PTR]], [[ADD_PTR]]
-; CHECK-NEXT: br i1 [[CMP]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_END]]
-; CHECK: while.end:
-; CHECK-NEXT: [[COUNT_0_LCSSA:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT: ret i64 [[COUNT_0_LCSSA]]
-;
entry:
- %idx.ext = sext i32 %l to i64
+ %idx.ext = sext i32 %l to i64
%add.ptr = getelementptr inbounds i16, i16* %ptr, i64 %idx.ext
%cmp7 = icmp sgt i32 %l, 0
br i1 %cmp7, label %while.body.preheader, label %while.end
%count.09 = phi i64 [ %add, %while.body ], [ 0, %while.body.preheader ]
%ptr.addr.16 = phi i16* [ %incdec.ptr, %while.body ], [ %ptr, %while.body.preheader ]
%0 = load i16, i16* %ptr.addr.16, align 1
- %cmp1 = icmp slt i16 %0, -64
- %cond = zext i1 %cmp1 to i64
+ %cmp1 = icmp slt i16 %0, -64
+ %cond = zext i1 %cmp1 to i64
%add = add nsw i64 %cond, %count.09
%incdec.ptr = getelementptr inbounds i16, i16* %ptr.addr.16, i64 1
%cmp = icmp ult i16* %incdec.ptr, %add.ptr
%count.0.lcssa = phi i64 [ 0, %entry ], [ %add.lcssa, %while.end.loopexit ]
ret i64 %count.0.lcssa
+; CHECK-LABEL: foo4
+; CHECK: load <2 x i16>
+; CHECK: icmp slt <2 x i16>
}
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -debug-only=loop-vectorize -passes='function(loop-vectorize),default<O2>' -vectorizer-maximize-bandwidth -mtriple=powerpc64-unknown-linux -S -mcpu=pwr8 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR8
; RUN: opt < %s -debug-only=loop-vectorize -passes='function(loop-vectorize),default<O2>' -vectorizer-maximize-bandwidth -mtriple=powerpc64le-unknown-linux -S -mcpu=pwr9 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PWR9
; REQUIRES: asserts
@b = global [1024 x i8] zeroinitializer, align 16
define i32 @foo() {
-; CHECK-PWR8-LABEL: @foo(
-; CHECK-PWR8-NEXT: iter.check:
-; CHECK-PWR8-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-PWR8: vector.body:
-; CHECK-PWR8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP32:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP33:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI3:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[INDEX]]
-; CHECK-PWR8-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
-; CHECK-PWR8-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP1]], align 16
-; CHECK-PWR8-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 16
-; CHECK-PWR8-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-PWR8-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 16
-; CHECK-PWR8-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 32
-; CHECK-PWR8-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <16 x i8>*
-; CHECK-PWR8-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP5]], align 16
-; CHECK-PWR8-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 48
-; CHECK-PWR8-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>*
-; CHECK-PWR8-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 16
-; CHECK-PWR8-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-PWR8-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-PWR8-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
-; CHECK-PWR8-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
-; CHECK-PWR8-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[INDEX]]
-; CHECK-PWR8-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <16 x i8>*
-; CHECK-PWR8-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP13]], align 16
-; CHECK-PWR8-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i64 16
-; CHECK-PWR8-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <16 x i8>*
-; CHECK-PWR8-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP15]], align 16
-; CHECK-PWR8-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i64 32
-; CHECK-PWR8-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <16 x i8>*
-; CHECK-PWR8-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP17]], align 16
-; CHECK-PWR8-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i64 48
-; CHECK-PWR8-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <16 x i8>*
-; CHECK-PWR8-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, <16 x i8>* [[TMP19]], align 16
-; CHECK-PWR8-NEXT: [[TMP20:%.*]] = zext <16 x i8> [[WIDE_LOAD7]] to <16 x i32>
-; CHECK-PWR8-NEXT: [[TMP21:%.*]] = zext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
-; CHECK-PWR8-NEXT: [[TMP22:%.*]] = zext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
-; CHECK-PWR8-NEXT: [[TMP23:%.*]] = zext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
-; CHECK-PWR8-NEXT: [[TMP24:%.*]] = sub nsw <16 x i32> [[TMP8]], [[TMP20]]
-; CHECK-PWR8-NEXT: [[TMP25:%.*]] = sub nsw <16 x i32> [[TMP9]], [[TMP21]]
-; CHECK-PWR8-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP10]], [[TMP22]]
-; CHECK-PWR8-NEXT: [[TMP27:%.*]] = sub nsw <16 x i32> [[TMP11]], [[TMP23]]
-; CHECK-PWR8-NEXT: [[TMP28:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP24]], i1 true)
-; CHECK-PWR8-NEXT: [[TMP29:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP25]], i1 true)
-; CHECK-PWR8-NEXT: [[TMP30:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP26]], i1 true)
-; CHECK-PWR8-NEXT: [[TMP31:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP27]], i1 true)
-; CHECK-PWR8-NEXT: [[TMP32]] = add <16 x i32> [[TMP28]], [[VEC_PHI]]
-; CHECK-PWR8-NEXT: [[TMP33]] = add <16 x i32> [[TMP29]], [[VEC_PHI1]]
-; CHECK-PWR8-NEXT: [[TMP34]] = add <16 x i32> [[TMP30]], [[VEC_PHI2]]
-; CHECK-PWR8-NEXT: [[TMP35]] = add <16 x i32> [[TMP31]], [[VEC_PHI3]]
-; CHECK-PWR8-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 64
-; CHECK-PWR8-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-PWR8-NEXT: br i1 [[TMP36]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-PWR8: for.cond.cleanup:
-; CHECK-PWR8-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP33]], [[TMP32]]
-; CHECK-PWR8-NEXT: [[BIN_RDX11:%.*]] = add <16 x i32> [[BIN_RDX]], [[TMP34]]
-; CHECK-PWR8-NEXT: [[BIN_RDX12:%.*]] = add <16 x i32> [[BIN_RDX11]], [[TMP35]]
-; CHECK-PWR8-NEXT: [[TMP37:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX12]])
-; CHECK-PWR8-NEXT: ret i32 [[TMP37]]
-;
-; CHECK-PWR9-LABEL: @foo(
-; CHECK-PWR9-NEXT: entry:
-; CHECK-PWR9-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-PWR9: vector.body:
-; CHECK-PWR9-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI2:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP66:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI3:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP67:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI4:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP68:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI5:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI6:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP70:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI7:%.*]] = phi <8 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP71:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[INDEX]]
-; CHECK-PWR9-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 16
-; CHECK-PWR9-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 8
-; CHECK-PWR9-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 8
-; CHECK-PWR9-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 16
-; CHECK-PWR9-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 16
-; CHECK-PWR9-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 24
-; CHECK-PWR9-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 8
-; CHECK-PWR9-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 32
-; CHECK-PWR9-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP9]], align 16
-; CHECK-PWR9-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 40
-; CHECK-PWR9-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 8
-; CHECK-PWR9-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 48
-; CHECK-PWR9-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 16
-; CHECK-PWR9-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP0]], i64 56
-; CHECK-PWR9-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i8>, <8 x i8>* [[TMP15]], align 8
-; CHECK-PWR9-NEXT: [[TMP16:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP17:%.*]] = zext <8 x i8> [[WIDE_LOAD8]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP18:%.*]] = zext <8 x i8> [[WIDE_LOAD9]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP19:%.*]] = zext <8 x i8> [[WIDE_LOAD10]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP20:%.*]] = zext <8 x i8> [[WIDE_LOAD11]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP21:%.*]] = zext <8 x i8> [[WIDE_LOAD12]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP22:%.*]] = zext <8 x i8> [[WIDE_LOAD13]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP23:%.*]] = zext <8 x i8> [[WIDE_LOAD14]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP24:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[INDEX]]
-; CHECK-PWR9-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD15:%.*]] = load <8 x i8>, <8 x i8>* [[TMP25]], align 16
-; CHECK-PWR9-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i64 8
-; CHECK-PWR9-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i8>, <8 x i8>* [[TMP27]], align 8
-; CHECK-PWR9-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i64 16
-; CHECK-PWR9-NEXT: [[TMP29:%.*]] = bitcast i8* [[TMP28]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i8>, <8 x i8>* [[TMP29]], align 16
-; CHECK-PWR9-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i64 24
-; CHECK-PWR9-NEXT: [[TMP31:%.*]] = bitcast i8* [[TMP30]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD18:%.*]] = load <8 x i8>, <8 x i8>* [[TMP31]], align 8
-; CHECK-PWR9-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i64 32
-; CHECK-PWR9-NEXT: [[TMP33:%.*]] = bitcast i8* [[TMP32]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD19:%.*]] = load <8 x i8>, <8 x i8>* [[TMP33]], align 16
-; CHECK-PWR9-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i64 40
-; CHECK-PWR9-NEXT: [[TMP35:%.*]] = bitcast i8* [[TMP34]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i8>, <8 x i8>* [[TMP35]], align 8
-; CHECK-PWR9-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i64 48
-; CHECK-PWR9-NEXT: [[TMP37:%.*]] = bitcast i8* [[TMP36]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i8>, <8 x i8>* [[TMP37]], align 16
-; CHECK-PWR9-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, i8* [[TMP24]], i64 56
-; CHECK-PWR9-NEXT: [[TMP39:%.*]] = bitcast i8* [[TMP38]] to <8 x i8>*
-; CHECK-PWR9-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i8>, <8 x i8>* [[TMP39]], align 8
-; CHECK-PWR9-NEXT: [[TMP40:%.*]] = zext <8 x i8> [[WIDE_LOAD15]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP41:%.*]] = zext <8 x i8> [[WIDE_LOAD16]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP42:%.*]] = zext <8 x i8> [[WIDE_LOAD17]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP43:%.*]] = zext <8 x i8> [[WIDE_LOAD18]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP44:%.*]] = zext <8 x i8> [[WIDE_LOAD19]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP45:%.*]] = zext <8 x i8> [[WIDE_LOAD20]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP46:%.*]] = zext <8 x i8> [[WIDE_LOAD21]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP47:%.*]] = zext <8 x i8> [[WIDE_LOAD22]] to <8 x i32>
-; CHECK-PWR9-NEXT: [[TMP48:%.*]] = sub nsw <8 x i32> [[TMP16]], [[TMP40]]
-; CHECK-PWR9-NEXT: [[TMP49:%.*]] = sub nsw <8 x i32> [[TMP17]], [[TMP41]]
-; CHECK-PWR9-NEXT: [[TMP50:%.*]] = sub nsw <8 x i32> [[TMP18]], [[TMP42]]
-; CHECK-PWR9-NEXT: [[TMP51:%.*]] = sub nsw <8 x i32> [[TMP19]], [[TMP43]]
-; CHECK-PWR9-NEXT: [[TMP52:%.*]] = sub nsw <8 x i32> [[TMP20]], [[TMP44]]
-; CHECK-PWR9-NEXT: [[TMP53:%.*]] = sub nsw <8 x i32> [[TMP21]], [[TMP45]]
-; CHECK-PWR9-NEXT: [[TMP54:%.*]] = sub nsw <8 x i32> [[TMP22]], [[TMP46]]
-; CHECK-PWR9-NEXT: [[TMP55:%.*]] = sub nsw <8 x i32> [[TMP23]], [[TMP47]]
-; CHECK-PWR9-NEXT: [[TMP56:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP48]], i1 true)
-; CHECK-PWR9-NEXT: [[TMP57:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP49]], i1 true)
-; CHECK-PWR9-NEXT: [[TMP58:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP50]], i1 true)
-; CHECK-PWR9-NEXT: [[TMP59:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP51]], i1 true)
-; CHECK-PWR9-NEXT: [[TMP60:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP52]], i1 true)
-; CHECK-PWR9-NEXT: [[TMP61:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP53]], i1 true)
-; CHECK-PWR9-NEXT: [[TMP62:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP54]], i1 true)
-; CHECK-PWR9-NEXT: [[TMP63:%.*]] = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> [[TMP55]], i1 true)
-; CHECK-PWR9-NEXT: [[TMP64]] = add <8 x i32> [[TMP56]], [[VEC_PHI]]
-; CHECK-PWR9-NEXT: [[TMP65]] = add <8 x i32> [[TMP57]], [[VEC_PHI1]]
-; CHECK-PWR9-NEXT: [[TMP66]] = add <8 x i32> [[TMP58]], [[VEC_PHI2]]
-; CHECK-PWR9-NEXT: [[TMP67]] = add <8 x i32> [[TMP59]], [[VEC_PHI3]]
-; CHECK-PWR9-NEXT: [[TMP68]] = add <8 x i32> [[TMP60]], [[VEC_PHI4]]
-; CHECK-PWR9-NEXT: [[TMP69]] = add <8 x i32> [[TMP61]], [[VEC_PHI5]]
-; CHECK-PWR9-NEXT: [[TMP70]] = add <8 x i32> [[TMP62]], [[VEC_PHI6]]
-; CHECK-PWR9-NEXT: [[TMP71]] = add <8 x i32> [[TMP63]], [[VEC_PHI7]]
-; CHECK-PWR9-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 64
-; CHECK-PWR9-NEXT: [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-PWR9-NEXT: br i1 [[TMP72]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-PWR9: for.cond.cleanup:
-; CHECK-PWR9-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP65]], [[TMP64]]
-; CHECK-PWR9-NEXT: [[BIN_RDX23:%.*]] = add <8 x i32> [[BIN_RDX]], [[TMP66]]
-; CHECK-PWR9-NEXT: [[BIN_RDX24:%.*]] = add <8 x i32> [[BIN_RDX23]], [[TMP67]]
-; CHECK-PWR9-NEXT: [[BIN_RDX25:%.*]] = add <8 x i32> [[BIN_RDX24]], [[TMP68]]
-; CHECK-PWR9-NEXT: [[BIN_RDX26:%.*]] = add <8 x i32> [[BIN_RDX25]], [[TMP69]]
-; CHECK-PWR9-NEXT: [[BIN_RDX27:%.*]] = add <8 x i32> [[BIN_RDX26]], [[TMP70]]
-; CHECK-PWR9-NEXT: [[BIN_RDX28:%.*]] = add <8 x i32> [[BIN_RDX27]], [[TMP71]]
-; CHECK-PWR9-NEXT: [[TMP73:%.*]] = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX28]])
-; CHECK-PWR9-NEXT: ret i32 [[TMP73]]
-;
+; CHECK-LABEL: foo
+; CHECK-PWR8: Executing best plan with VF=16, UF=4
+; CHECK-PWR9: Executing best plan with VF=8, UF=8
entry:
; For an indvars.iv used in a computation chain that only feeds into a getelementptr
; or cmp, no vector version of it is created, so the vector register usage will not
; exceed the number of available vector registers.
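; As a rough illustration (a hypothetical C shape, not the exact source of the test
; below; the names and bounds are assumptions), the loop in question looks like:
;
;   unsigned goo(void) {
;     unsigned sum = 0;
;     for (long i = 0; i < n; ++i)                    // i only feeds the address
;       sum += abs((int)a[i + 3] - (int)b[i + 2]);    // arithmetic and the exit compare
;     return sum;
;   }
;
; Since i stays scalar, only the widened loads, zexts, subs, abs calls and the
; reduction phis consume vector registers, which is why the plan checked below can
; still use VF=16 with UF=4.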
-; CHECK-LABEL: @goo(
-; CHECK-NEXT: iter.check:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP35:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <16 x i32> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP2]], align 1
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 16
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 32
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 1
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i64 48
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, <16 x i8>* [[TMP8]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP15]], align 2
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i64 16
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, <16 x i8>* [[TMP17]], align 2
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i64 32
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, <16 x i8>* [[TMP19]], align 2
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i64 48
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP20]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, <16 x i8>* [[TMP21]], align 2
-; CHECK-NEXT: [[TMP22:%.*]] = zext <16 x i8> [[WIDE_LOAD7]] to <16 x i32>
-; CHECK-NEXT: [[TMP23:%.*]] = zext <16 x i8> [[WIDE_LOAD8]] to <16 x i32>
-; CHECK-NEXT: [[TMP24:%.*]] = zext <16 x i8> [[WIDE_LOAD9]] to <16 x i32>
-; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[WIDE_LOAD10]] to <16 x i32>
-; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <16 x i32> [[TMP9]], [[TMP22]]
-; CHECK-NEXT: [[TMP27:%.*]] = sub nsw <16 x i32> [[TMP10]], [[TMP23]]
-; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <16 x i32> [[TMP11]], [[TMP24]]
-; CHECK-NEXT: [[TMP29:%.*]] = sub nsw <16 x i32> [[TMP12]], [[TMP25]]
-; CHECK-NEXT: [[TMP30:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP26]], i1 true)
-; CHECK-NEXT: [[TMP31:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP27]], i1 true)
-; CHECK-NEXT: [[TMP32:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP28]], i1 true)
-; CHECK-NEXT: [[TMP33:%.*]] = tail call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[TMP29]], i1 true)
-; CHECK-NEXT: [[TMP34]] = add <16 x i32> [[TMP30]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP35]] = add <16 x i32> [[TMP31]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP36]] = add <16 x i32> [[TMP32]], [[VEC_PHI2]]
-; CHECK-NEXT: [[TMP37]] = add <16 x i32> [[TMP33]], [[VEC_PHI3]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 64
-; CHECK-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP38]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP35]], [[TMP34]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <16 x i32> [[BIN_RDX]], [[TMP36]]
-; CHECK-NEXT: [[BIN_RDX12:%.*]] = add <16 x i32> [[BIN_RDX11]], [[TMP37]]
-; CHECK-NEXT: [[TMP39:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX12]])
-; CHECK-NEXT: ret i32 [[TMP39]]
-;
+; CHECK-LABEL: goo
+; CHECK: Executing best plan with VF=16, UF=4
entry:
br label %for.body
}
define i64 @bar(i64* nocapture %a) {
-; CHECK-LABEL: @bar(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI12:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI13:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI14:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI15:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI17:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP42:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI18:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP43:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI19:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP44:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI20:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP45:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI21:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP46:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI22:%.*]] = phi <2 x i64> [ zeroinitializer, [[ENTRY]] ], [ [[TMP47:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <2 x i64> [[VEC_IND]], <i64 4, i64 4>
-; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <2 x i64> [[VEC_IND]], <i64 6, i64 6>
-; CHECK-NEXT: [[STEP_ADD3:%.*]] = add <2 x i64> [[VEC_IND]], <i64 8, i64 8>
-; CHECK-NEXT: [[STEP_ADD4:%.*]] = add <2 x i64> [[VEC_IND]], <i64 10, i64 10>
-; CHECK-NEXT: [[STEP_ADD5:%.*]] = add <2 x i64> [[VEC_IND]], <i64 12, i64 12>
-; CHECK-NEXT: [[STEP_ADD6:%.*]] = add <2 x i64> [[VEC_IND]], <i64 14, i64 14>
-; CHECK-NEXT: [[STEP_ADD7:%.*]] = add <2 x i64> [[VEC_IND]], <i64 16, i64 16>
-; CHECK-NEXT: [[STEP_ADD8:%.*]] = add <2 x i64> [[VEC_IND]], <i64 18, i64 18>
-; CHECK-NEXT: [[STEP_ADD9:%.*]] = add <2 x i64> [[VEC_IND]], <i64 20, i64 20>
-; CHECK-NEXT: [[STEP_ADD10:%.*]] = add <2 x i64> [[VEC_IND]], <i64 22, i64 22>
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[TMP2]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 4
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 6
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[TMP6]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD25:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 8
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64* [[TMP8]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <2 x i64>, <2 x i64>* [[TMP9]], align 8
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 10
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD27:%.*]] = load <2 x i64>, <2 x i64>* [[TMP11]], align 8
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 12
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD28:%.*]] = load <2 x i64>, <2 x i64>* [[TMP13]], align 8
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 14
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64* [[TMP14]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i64>, <2 x i64>* [[TMP15]], align 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 16
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i64* [[TMP16]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i64>, <2 x i64>* [[TMP17]], align 8
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 18
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i64* [[TMP18]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x i64>, <2 x i64>* [[TMP19]], align 8
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 20
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i64* [[TMP20]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i64>, <2 x i64>* [[TMP21]], align 8
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 22
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast i64* [[TMP22]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <2 x i64>, <2 x i64>* [[TMP23]], align 8
-; CHECK-NEXT: [[TMP24:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP25:%.*]] = add nsw <2 x i64> [[STEP_ADD]], [[WIDE_LOAD23]]
-; CHECK-NEXT: [[TMP26:%.*]] = add nsw <2 x i64> [[STEP_ADD1]], [[WIDE_LOAD24]]
-; CHECK-NEXT: [[TMP27:%.*]] = add nsw <2 x i64> [[STEP_ADD2]], [[WIDE_LOAD25]]
-; CHECK-NEXT: [[TMP28:%.*]] = add nsw <2 x i64> [[STEP_ADD3]], [[WIDE_LOAD26]]
-; CHECK-NEXT: [[TMP29:%.*]] = add nsw <2 x i64> [[STEP_ADD4]], [[WIDE_LOAD27]]
-; CHECK-NEXT: [[TMP30:%.*]] = add nsw <2 x i64> [[STEP_ADD5]], [[WIDE_LOAD28]]
-; CHECK-NEXT: [[TMP31:%.*]] = add nsw <2 x i64> [[STEP_ADD6]], [[WIDE_LOAD29]]
-; CHECK-NEXT: [[TMP32:%.*]] = add nsw <2 x i64> [[STEP_ADD7]], [[WIDE_LOAD30]]
-; CHECK-NEXT: [[TMP33:%.*]] = add nsw <2 x i64> [[STEP_ADD8]], [[WIDE_LOAD31]]
-; CHECK-NEXT: [[TMP34:%.*]] = add nsw <2 x i64> [[STEP_ADD9]], [[WIDE_LOAD32]]
-; CHECK-NEXT: [[TMP35:%.*]] = add nsw <2 x i64> [[STEP_ADD10]], [[WIDE_LOAD33]]
-; CHECK-NEXT: store <2 x i64> [[TMP24]], <2 x i64>* [[TMP1]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP25]], <2 x i64>* [[TMP3]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP26]], <2 x i64>* [[TMP5]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP27]], <2 x i64>* [[TMP7]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP28]], <2 x i64>* [[TMP9]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP29]], <2 x i64>* [[TMP11]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP30]], <2 x i64>* [[TMP13]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP31]], <2 x i64>* [[TMP15]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP32]], <2 x i64>* [[TMP17]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP33]], <2 x i64>* [[TMP19]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP34]], <2 x i64>* [[TMP21]], align 8
-; CHECK-NEXT: store <2 x i64> [[TMP35]], <2 x i64>* [[TMP23]], align 8
-; CHECK-NEXT: [[TMP36]] = add <2 x i64> [[TMP24]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP37]] = add <2 x i64> [[TMP25]], [[VEC_PHI12]]
-; CHECK-NEXT: [[TMP38]] = add <2 x i64> [[TMP26]], [[VEC_PHI13]]
-; CHECK-NEXT: [[TMP39]] = add <2 x i64> [[TMP27]], [[VEC_PHI14]]
-; CHECK-NEXT: [[TMP40]] = add <2 x i64> [[TMP28]], [[VEC_PHI15]]
-; CHECK-NEXT: [[TMP41]] = add <2 x i64> [[TMP29]], [[VEC_PHI16]]
-; CHECK-NEXT: [[TMP42]] = add <2 x i64> [[TMP30]], [[VEC_PHI17]]
-; CHECK-NEXT: [[TMP43]] = add <2 x i64> [[TMP31]], [[VEC_PHI18]]
-; CHECK-NEXT: [[TMP44]] = add <2 x i64> [[TMP32]], [[VEC_PHI19]]
-; CHECK-NEXT: [[TMP45]] = add <2 x i64> [[TMP33]], [[VEC_PHI20]]
-; CHECK-NEXT: [[TMP46]] = add <2 x i64> [[TMP34]], [[VEC_PHI21]]
-; CHECK-NEXT: [[TMP47]] = add <2 x i64> [[TMP35]], [[VEC_PHI22]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 24
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 24, i64 24>
-; CHECK-NEXT: [[TMP48:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1008
-; CHECK-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP37]], [[TMP36]]
-; CHECK-NEXT: [[BIN_RDX34:%.*]] = add <2 x i64> [[BIN_RDX]], [[TMP38]]
-; CHECK-NEXT: [[BIN_RDX35:%.*]] = add <2 x i64> [[BIN_RDX34]], [[TMP39]]
-; CHECK-NEXT: [[BIN_RDX36:%.*]] = add <2 x i64> [[BIN_RDX35]], [[TMP40]]
-; CHECK-NEXT: [[BIN_RDX37:%.*]] = add <2 x i64> [[BIN_RDX36]], [[TMP41]]
-; CHECK-NEXT: [[BIN_RDX38:%.*]] = add <2 x i64> [[BIN_RDX37]], [[TMP42]]
-; CHECK-NEXT: [[BIN_RDX39:%.*]] = add <2 x i64> [[BIN_RDX38]], [[TMP43]]
-; CHECK-NEXT: [[BIN_RDX40:%.*]] = add <2 x i64> [[BIN_RDX39]], [[TMP44]]
-; CHECK-NEXT: [[BIN_RDX41:%.*]] = add <2 x i64> [[BIN_RDX40]], [[TMP45]]
-; CHECK-NEXT: [[BIN_RDX42:%.*]] = add <2 x i64> [[BIN_RDX41]], [[TMP46]]
-; CHECK-NEXT: [[BIN_RDX43:%.*]] = add <2 x i64> [[BIN_RDX42]], [[TMP47]]
-; CHECK-NEXT: [[TMP49:%.*]] = tail call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX43]])
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1008
-; CHECK-NEXT: [[TMP50:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP50]], 1008
-; CHECK-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[ADD2:%.*]] = add nsw i64 [[ADD]], [[TMP49]]
-; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1009
-; CHECK-NEXT: [[TMP51:%.*]] = load i64, i64* [[ARRAYIDX_1]], align 8
-; CHECK-NEXT: [[ADD_1:%.*]] = add nsw i64 [[TMP51]], 1009
-; CHECK-NEXT: store i64 [[ADD_1]], i64* [[ARRAYIDX_1]], align 8
-; CHECK-NEXT: [[ADD2_1:%.*]] = add nsw i64 [[ADD_1]], [[ADD2]]
-; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1010
-; CHECK-NEXT: [[TMP52:%.*]] = load i64, i64* [[ARRAYIDX_2]], align 8
-; CHECK-NEXT: [[ADD_2:%.*]] = add nsw i64 [[TMP52]], 1010
-; CHECK-NEXT: store i64 [[ADD_2]], i64* [[ARRAYIDX_2]], align 8
-; CHECK-NEXT: [[ADD2_2:%.*]] = add nsw i64 [[ADD_2]], [[ADD2_1]]
-; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1011
-; CHECK-NEXT: [[TMP53:%.*]] = load i64, i64* [[ARRAYIDX_3]], align 8
-; CHECK-NEXT: [[ADD_3:%.*]] = add nsw i64 [[TMP53]], 1011
-; CHECK-NEXT: store i64 [[ADD_3]], i64* [[ARRAYIDX_3]], align 8
-; CHECK-NEXT: [[ADD2_3:%.*]] = add nsw i64 [[ADD_3]], [[ADD2_2]]
-; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1012
-; CHECK-NEXT: [[TMP54:%.*]] = load i64, i64* [[ARRAYIDX_4]], align 8
-; CHECK-NEXT: [[ADD_4:%.*]] = add nsw i64 [[TMP54]], 1012
-; CHECK-NEXT: store i64 [[ADD_4]], i64* [[ARRAYIDX_4]], align 8
-; CHECK-NEXT: [[ADD2_4:%.*]] = add nsw i64 [[ADD_4]], [[ADD2_3]]
-; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1013
-; CHECK-NEXT: [[TMP55:%.*]] = load i64, i64* [[ARRAYIDX_5]], align 8
-; CHECK-NEXT: [[ADD_5:%.*]] = add nsw i64 [[TMP55]], 1013
-; CHECK-NEXT: store i64 [[ADD_5]], i64* [[ARRAYIDX_5]], align 8
-; CHECK-NEXT: [[ADD2_5:%.*]] = add nsw i64 [[ADD_5]], [[ADD2_4]]
-; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1014
-; CHECK-NEXT: [[TMP56:%.*]] = load i64, i64* [[ARRAYIDX_6]], align 8
-; CHECK-NEXT: [[ADD_6:%.*]] = add nsw i64 [[TMP56]], 1014
-; CHECK-NEXT: store i64 [[ADD_6]], i64* [[ARRAYIDX_6]], align 8
-; CHECK-NEXT: [[ADD2_6:%.*]] = add nsw i64 [[ADD_6]], [[ADD2_5]]
-; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1015
-; CHECK-NEXT: [[TMP57:%.*]] = load i64, i64* [[ARRAYIDX_7]], align 8
-; CHECK-NEXT: [[ADD_7:%.*]] = add nsw i64 [[TMP57]], 1015
-; CHECK-NEXT: store i64 [[ADD_7]], i64* [[ARRAYIDX_7]], align 8
-; CHECK-NEXT: [[ADD2_7:%.*]] = add nsw i64 [[ADD_7]], [[ADD2_6]]
-; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1016
-; CHECK-NEXT: [[TMP58:%.*]] = load i64, i64* [[ARRAYIDX_8]], align 8
-; CHECK-NEXT: [[ADD_8:%.*]] = add nsw i64 [[TMP58]], 1016
-; CHECK-NEXT: store i64 [[ADD_8]], i64* [[ARRAYIDX_8]], align 8
-; CHECK-NEXT: [[ADD2_8:%.*]] = add nsw i64 [[ADD_8]], [[ADD2_7]]
-; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1017
-; CHECK-NEXT: [[TMP59:%.*]] = load i64, i64* [[ARRAYIDX_9]], align 8
-; CHECK-NEXT: [[ADD_9:%.*]] = add nsw i64 [[TMP59]], 1017
-; CHECK-NEXT: store i64 [[ADD_9]], i64* [[ARRAYIDX_9]], align 8
-; CHECK-NEXT: [[ADD2_9:%.*]] = add nsw i64 [[ADD_9]], [[ADD2_8]]
-; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1018
-; CHECK-NEXT: [[TMP60:%.*]] = load i64, i64* [[ARRAYIDX_10]], align 8
-; CHECK-NEXT: [[ADD_10:%.*]] = add nsw i64 [[TMP60]], 1018
-; CHECK-NEXT: store i64 [[ADD_10]], i64* [[ARRAYIDX_10]], align 8
-; CHECK-NEXT: [[ADD2_10:%.*]] = add nsw i64 [[ADD_10]], [[ADD2_9]]
-; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1019
-; CHECK-NEXT: [[TMP61:%.*]] = load i64, i64* [[ARRAYIDX_11]], align 8
-; CHECK-NEXT: [[ADD_11:%.*]] = add nsw i64 [[TMP61]], 1019
-; CHECK-NEXT: store i64 [[ADD_11]], i64* [[ARRAYIDX_11]], align 8
-; CHECK-NEXT: [[ADD2_11:%.*]] = add nsw i64 [[ADD_11]], [[ADD2_10]]
-; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1020
-; CHECK-NEXT: [[TMP62:%.*]] = load i64, i64* [[ARRAYIDX_12]], align 8
-; CHECK-NEXT: [[ADD_12:%.*]] = add nsw i64 [[TMP62]], 1020
-; CHECK-NEXT: store i64 [[ADD_12]], i64* [[ARRAYIDX_12]], align 8
-; CHECK-NEXT: [[ADD2_12:%.*]] = add nsw i64 [[ADD_12]], [[ADD2_11]]
-; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1021
-; CHECK-NEXT: [[TMP63:%.*]] = load i64, i64* [[ARRAYIDX_13]], align 8
-; CHECK-NEXT: [[ADD_13:%.*]] = add nsw i64 [[TMP63]], 1021
-; CHECK-NEXT: store i64 [[ADD_13]], i64* [[ARRAYIDX_13]], align 8
-; CHECK-NEXT: [[ADD2_13:%.*]] = add nsw i64 [[ADD_13]], [[ADD2_12]]
-; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1022
-; CHECK-NEXT: [[TMP64:%.*]] = load i64, i64* [[ARRAYIDX_14]], align 8
-; CHECK-NEXT: [[ADD_14:%.*]] = add nsw i64 [[TMP64]], 1022
-; CHECK-NEXT: store i64 [[ADD_14]], i64* [[ARRAYIDX_14]], align 8
-; CHECK-NEXT: [[ADD2_14:%.*]] = add nsw i64 [[ADD_14]], [[ADD2_13]]
-; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1023
-; CHECK-NEXT: [[TMP65:%.*]] = load i64, i64* [[ARRAYIDX_15]], align 8
-; CHECK-NEXT: [[ADD_15:%.*]] = add nsw i64 [[TMP65]], 1023
-; CHECK-NEXT: store i64 [[ADD_15]], i64* [[ARRAYIDX_15]], align 8
-; CHECK-NEXT: [[ADD2_15:%.*]] = add nsw i64 [[ADD_15]], [[ADD2_14]]
-; CHECK-NEXT: ret i64 [[ADD2_15]]
-;
+; CHECK-LABEL: bar
+; CHECK: Executing best plan with VF=2, UF=12
entry:
br label %for.body
@c = external global [0 x i32], align 4
define void @hoo(i32 %n) {
-; CHECK-LABEL: @hoo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDUCTION1:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT: [[INDUCTION2:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT: [[INDUCTION3:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT: [[INDUCTION4:%.*]] = add nuw nsw i64 [[INDEX]], 4
-; CHECK-NEXT: [[INDUCTION5:%.*]] = add nuw nsw i64 [[INDEX]], 5
-; CHECK-NEXT: [[INDUCTION6:%.*]] = add nuw nsw i64 [[INDEX]], 6
-; CHECK-NEXT: [[INDUCTION7:%.*]] = add nuw nsw i64 [[INDEX]], 7
-; CHECK-NEXT: [[INDUCTION8:%.*]] = add nuw nsw i64 [[INDEX]], 8
-; CHECK-NEXT: [[INDUCTION9:%.*]] = add nuw nsw i64 [[INDEX]], 9
-; CHECK-NEXT: [[INDUCTION10:%.*]] = add nuw nsw i64 [[INDEX]], 10
-; CHECK-NEXT: [[INDUCTION11:%.*]] = add nuw nsw i64 [[INDEX]], 11
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION1]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION2]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION3]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION4]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION5]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION6]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION7]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION8]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION9]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION10]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDUCTION11]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i64, i64* [[TMP0]], align 8
-; CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP1]], align 8
-; CHECK-NEXT: [[TMP14:%.*]] = load i64, i64* [[TMP2]], align 8
-; CHECK-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP3]], align 8
-; CHECK-NEXT: [[TMP16:%.*]] = load i64, i64* [[TMP4]], align 8
-; CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP5]], align 8
-; CHECK-NEXT: [[TMP18:%.*]] = load i64, i64* [[TMP6]], align 8
-; CHECK-NEXT: [[TMP19:%.*]] = load i64, i64* [[TMP7]], align 8
-; CHECK-NEXT: [[TMP20:%.*]] = load i64, i64* [[TMP8]], align 8
-; CHECK-NEXT: [[TMP21:%.*]] = load i64, i64* [[TMP9]], align 8
-; CHECK-NEXT: [[TMP22:%.*]] = load i64, i64* [[TMP10]], align 8
-; CHECK-NEXT: [[TMP23:%.*]] = load i64, i64* [[TMP11]], align 8
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP12]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP13]]
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP14]]
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP15]]
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP16]]
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP17]]
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP18]]
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP19]]
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP20]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP21]]
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP22]]
-; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP23]]
-; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP24]], align 4
-; CHECK-NEXT: [[TMP37:%.*]] = load i32, i32* [[TMP25]], align 4
-; CHECK-NEXT: [[TMP38:%.*]] = load i32, i32* [[TMP26]], align 4
-; CHECK-NEXT: [[TMP39:%.*]] = load i32, i32* [[TMP27]], align 4
-; CHECK-NEXT: [[TMP40:%.*]] = load i32, i32* [[TMP28]], align 4
-; CHECK-NEXT: [[TMP41:%.*]] = load i32, i32* [[TMP29]], align 4
-; CHECK-NEXT: [[TMP42:%.*]] = load i32, i32* [[TMP30]], align 4
-; CHECK-NEXT: [[TMP43:%.*]] = load i32, i32* [[TMP31]], align 4
-; CHECK-NEXT: [[TMP44:%.*]] = load i32, i32* [[TMP32]], align 4
-; CHECK-NEXT: [[TMP45:%.*]] = load i32, i32* [[TMP33]], align 4
-; CHECK-NEXT: [[TMP46:%.*]] = load i32, i32* [[TMP34]], align 4
-; CHECK-NEXT: [[TMP47:%.*]] = load i32, i32* [[TMP35]], align 4
-; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION1]]
-; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION2]]
-; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION3]]
-; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION4]]
-; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION5]]
-; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION6]]
-; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION7]]
-; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION8]]
-; CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION9]]
-; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION10]]
-; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDUCTION11]]
-; CHECK-NEXT: store i32 [[TMP36]], i32* [[TMP48]], align 4
-; CHECK-NEXT: store i32 [[TMP37]], i32* [[TMP49]], align 4
-; CHECK-NEXT: store i32 [[TMP38]], i32* [[TMP50]], align 4
-; CHECK-NEXT: store i32 [[TMP39]], i32* [[TMP51]], align 4
-; CHECK-NEXT: store i32 [[TMP40]], i32* [[TMP52]], align 4
-; CHECK-NEXT: store i32 [[TMP41]], i32* [[TMP53]], align 4
-; CHECK-NEXT: store i32 [[TMP42]], i32* [[TMP54]], align 4
-; CHECK-NEXT: store i32 [[TMP43]], i32* [[TMP55]], align 4
-; CHECK-NEXT: store i32 [[TMP44]], i32* [[TMP56]], align 4
-; CHECK-NEXT: store i32 [[TMP45]], i32* [[TMP57]], align 4
-; CHECK-NEXT: store i32 [[TMP46]], i32* [[TMP58]], align 4
-; CHECK-NEXT: store i32 [[TMP47]], i32* [[TMP59]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 12
-; CHECK-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9996
-; CHECK-NEXT: br i1 [[TMP60]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP:%.*]] = load i64, i64* getelementptr inbounds ([0 x i64], [0 x i64]* @d, i64 0, i64 9996), align 8
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: store i32 [[TMP1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @c, i64 0, i64 9996), align 4
-; CHECK-NEXT: [[TMP_1:%.*]] = load i64, i64* getelementptr inbounds ([0 x i64], [0 x i64]* @d, i64 0, i64 9997), align 8
-; CHECK-NEXT: [[ARRAYIDX1_1:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP_1]]
-; CHECK-NEXT: [[TMP1_1:%.*]] = load i32, i32* [[ARRAYIDX1_1]], align 4
-; CHECK-NEXT: store i32 [[TMP1_1]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @c, i64 0, i64 9997), align 4
-; CHECK-NEXT: [[TMP_2:%.*]] = load i64, i64* getelementptr inbounds ([0 x i64], [0 x i64]* @d, i64 0, i64 9998), align 8
-; CHECK-NEXT: [[ARRAYIDX1_2:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP_2]]
-; CHECK-NEXT: [[TMP1_2:%.*]] = load i32, i32* [[ARRAYIDX1_2]], align 4
-; CHECK-NEXT: store i32 [[TMP1_2]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @c, i64 0, i64 9998), align 4
-; CHECK-NEXT: [[TMP_3:%.*]] = load i64, i64* getelementptr inbounds ([0 x i64], [0 x i64]* @d, i64 0, i64 9999), align 8
-; CHECK-NEXT: [[ARRAYIDX1_3:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP_3]]
-; CHECK-NEXT: [[TMP1_3:%.*]] = load i32, i32* [[ARRAYIDX1_3]], align 4
-; CHECK-NEXT: store i32 [[TMP1_3]], i32* getelementptr inbounds ([0 x i32], [0 x i32]* @c, i64 0, i64 9999), align 4
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: hoo
+; CHECK: Executing best plan with VF=1, UF=12
entry:
br label %for.body
}
define float @float_(float* nocapture readonly %a, float* nocapture readonly %b, i32 %n) {
-; CHECK-LABEL: @float_(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP]], label [[PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: preheader:
-; CHECK-NEXT: [[T033:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[T033]], -1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 352
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 5
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[N_MOD_VF_LHS_TRUNC:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT: [[N_MOD_VF34:%.*]] = urem i32 [[N_MOD_VF_LHS_TRUNC]], 12
-; CHECK-NEXT: [[N_MOD_VF_ZEXT:%.*]] = zext i32 [[N_MOD_VF34]] to i64
-; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 [[TMP2]], [[N_MOD_VF_ZEXT]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP63:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP64:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP65:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP66:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP67:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP68:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP69:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP70:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP71:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI9:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP72:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP73:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI11:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP74:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 5
-; CHECK-NEXT: [[INDUCTION12:%.*]] = or i64 [[OFFSET_IDX]], 32
-; CHECK-NEXT: [[INDUCTION13:%.*]] = or i64 [[OFFSET_IDX]], 64
-; CHECK-NEXT: [[INDUCTION14:%.*]] = or i64 [[OFFSET_IDX]], 96
-; CHECK-NEXT: [[INDUCTION15:%.*]] = add i64 [[OFFSET_IDX]], 128
-; CHECK-NEXT: [[INDUCTION16:%.*]] = add i64 [[OFFSET_IDX]], 160
-; CHECK-NEXT: [[INDUCTION17:%.*]] = add i64 [[OFFSET_IDX]], 192
-; CHECK-NEXT: [[INDUCTION18:%.*]] = add i64 [[OFFSET_IDX]], 224
-; CHECK-NEXT: [[INDUCTION19:%.*]] = add i64 [[OFFSET_IDX]], 256
-; CHECK-NEXT: [[INDUCTION20:%.*]] = add i64 [[OFFSET_IDX]], 288
-; CHECK-NEXT: [[INDUCTION21:%.*]] = add i64 [[OFFSET_IDX]], 320
-; CHECK-NEXT: [[INDUCTION22:%.*]] = add i64 [[OFFSET_IDX]], 352
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION12]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION13]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION14]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION15]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION16]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION17]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION18]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION19]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION20]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION21]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDUCTION22]]
-; CHECK-NEXT: [[TMP15:%.*]] = load float, float* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = load float, float* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[TMP6]], align 4
-; CHECK-NEXT: [[TMP19:%.*]] = load float, float* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP20:%.*]] = load float, float* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP21:%.*]] = load float, float* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP22:%.*]] = load float, float* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP23:%.*]] = load float, float* [[TMP11]], align 4
-; CHECK-NEXT: [[TMP24:%.*]] = load float, float* [[TMP12]], align 4
-; CHECK-NEXT: [[TMP25:%.*]] = load float, float* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[TMP14]], align 4
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION12]]
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION13]]
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION14]]
-; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION15]]
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION16]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION17]]
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION18]]
-; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION19]]
-; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION20]]
-; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION21]]
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDUCTION22]]
-; CHECK-NEXT: [[TMP39:%.*]] = load float, float* [[TMP27]], align 4
-; CHECK-NEXT: [[TMP40:%.*]] = load float, float* [[TMP28]], align 4
-; CHECK-NEXT: [[TMP41:%.*]] = load float, float* [[TMP29]], align 4
-; CHECK-NEXT: [[TMP42:%.*]] = load float, float* [[TMP30]], align 4
-; CHECK-NEXT: [[TMP43:%.*]] = load float, float* [[TMP31]], align 4
-; CHECK-NEXT: [[TMP44:%.*]] = load float, float* [[TMP32]], align 4
-; CHECK-NEXT: [[TMP45:%.*]] = load float, float* [[TMP33]], align 4
-; CHECK-NEXT: [[TMP46:%.*]] = load float, float* [[TMP34]], align 4
-; CHECK-NEXT: [[TMP47:%.*]] = load float, float* [[TMP35]], align 4
-; CHECK-NEXT: [[TMP48:%.*]] = load float, float* [[TMP36]], align 4
-; CHECK-NEXT: [[TMP49:%.*]] = load float, float* [[TMP37]], align 4
-; CHECK-NEXT: [[TMP50:%.*]] = load float, float* [[TMP38]], align 4
-; CHECK-NEXT: [[TMP51:%.*]] = fadd fast float [[TMP15]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP52:%.*]] = fadd fast float [[TMP16]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP53:%.*]] = fadd fast float [[TMP17]], [[VEC_PHI2]]
-; CHECK-NEXT: [[TMP54:%.*]] = fadd fast float [[TMP18]], [[VEC_PHI3]]
-; CHECK-NEXT: [[TMP55:%.*]] = fadd fast float [[TMP19]], [[VEC_PHI4]]
-; CHECK-NEXT: [[TMP56:%.*]] = fadd fast float [[TMP20]], [[VEC_PHI5]]
-; CHECK-NEXT: [[TMP57:%.*]] = fadd fast float [[TMP21]], [[VEC_PHI6]]
-; CHECK-NEXT: [[TMP58:%.*]] = fadd fast float [[TMP22]], [[VEC_PHI7]]
-; CHECK-NEXT: [[TMP59:%.*]] = fadd fast float [[TMP23]], [[VEC_PHI8]]
-; CHECK-NEXT: [[TMP60:%.*]] = fadd fast float [[TMP24]], [[VEC_PHI9]]
-; CHECK-NEXT: [[TMP61:%.*]] = fadd fast float [[TMP25]], [[VEC_PHI10]]
-; CHECK-NEXT: [[TMP62:%.*]] = fadd fast float [[TMP26]], [[VEC_PHI11]]
-; CHECK-NEXT: [[TMP63]] = fadd fast float [[TMP51]], [[TMP39]]
-; CHECK-NEXT: [[TMP64]] = fadd fast float [[TMP52]], [[TMP40]]
-; CHECK-NEXT: [[TMP65]] = fadd fast float [[TMP53]], [[TMP41]]
-; CHECK-NEXT: [[TMP66]] = fadd fast float [[TMP54]], [[TMP42]]
-; CHECK-NEXT: [[TMP67]] = fadd fast float [[TMP55]], [[TMP43]]
-; CHECK-NEXT: [[TMP68]] = fadd fast float [[TMP56]], [[TMP44]]
-; CHECK-NEXT: [[TMP69]] = fadd fast float [[TMP57]], [[TMP45]]
-; CHECK-NEXT: [[TMP70]] = fadd fast float [[TMP58]], [[TMP46]]
-; CHECK-NEXT: [[TMP71]] = fadd fast float [[TMP59]], [[TMP47]]
-; CHECK-NEXT: [[TMP72]] = fadd fast float [[TMP60]], [[TMP48]]
-; CHECK-NEXT: [[TMP73]] = fadd fast float [[TMP61]], [[TMP49]]
-; CHECK-NEXT: [[TMP74]] = fadd fast float [[TMP62]], [[TMP50]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 12
-; CHECK-NEXT: [[TMP75:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP75]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 5
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast float [[TMP64]], [[TMP63]]
-; CHECK-NEXT: [[BIN_RDX23:%.*]] = fadd fast float [[BIN_RDX]], [[TMP65]]
-; CHECK-NEXT: [[BIN_RDX24:%.*]] = fadd fast float [[BIN_RDX23]], [[TMP66]]
-; CHECK-NEXT: [[BIN_RDX25:%.*]] = fadd fast float [[BIN_RDX24]], [[TMP67]]
-; CHECK-NEXT: [[BIN_RDX26:%.*]] = fadd fast float [[BIN_RDX25]], [[TMP68]]
-; CHECK-NEXT: [[BIN_RDX27:%.*]] = fadd fast float [[BIN_RDX26]], [[TMP69]]
-; CHECK-NEXT: [[BIN_RDX28:%.*]] = fadd fast float [[BIN_RDX27]], [[TMP70]]
-; CHECK-NEXT: [[BIN_RDX29:%.*]] = fadd fast float [[BIN_RDX28]], [[TMP71]]
-; CHECK-NEXT: [[BIN_RDX30:%.*]] = fadd fast float [[BIN_RDX29]], [[TMP72]]
-; CHECK-NEXT: [[BIN_RDX31:%.*]] = fadd fast float [[BIN_RDX30]], [[TMP73]]
-; CHECK-NEXT: [[BIN_RDX32:%.*]] = fadd fast float [[BIN_RDX31]], [[TMP74]]
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF34]], 0
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_PREHEADER]]
-; CHECK: for.preheader:
-; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[PREHEADER]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[S_02_PH:%.*]] = phi float [ 0.000000e+00, [[PREHEADER]] ], [ [[BIN_RDX32]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR:%.*]]
-; CHECK: for:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR]] ], [ [[INDVARS_IV_PH]], [[FOR_PREHEADER]] ]
-; CHECK-NEXT: [[S_02:%.*]] = phi float [ [[ADD4:%.*]], [[FOR]] ], [ [[S_02_PH]], [[FOR_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[T1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[T2:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[T1]], [[S_02]]
-; CHECK-NEXT: [[ADD4]] = fadd fast float [[ADD]], [[T2]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 32
-; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], [[T033]]
-; CHECK-NEXT: br i1 [[CMP1]], label [[FOR]], label [[FOR_END]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[BIN_RDX32]], [[MIDDLE_BLOCK]] ], [ [[ADD4]], [[FOR]] ]
-; CHECK-NEXT: ret float [[S_0_LCSSA]]
-;
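+; The LV(REG) lines below are the vectorizer's register-pressure debug output;
+; they check the estimated maximum and loop-invariant register usage per PPC
+; register class for @float_.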
+;CHECK-LABEL: float_
+;CHECK: LV(REG): VF = 1
+;CHECK: LV(REG): Found max usage: 2 item
+;CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
+;CHECK-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 3 registers
+;CHECK: LV(REG): Found invariant usage: 1 item
+;CHECK-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers
entry:
%cmp = icmp sgt i32 %n, 0
define void @double_(double* nocapture %A, i32 %n) nounwind uwtable ssp {
-; CHECK-PWR8-LABEL: @double_(
-; CHECK-PWR8-NEXT: [[TMP1:%.*]] = sext i32 [[N:%.*]] to i64
-; CHECK-PWR8-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64
-; CHECK-PWR8-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; CHECK-PWR8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp eq i32 [[N]], 0
-; CHECK-PWR8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-PWR8: vector.ph:
-; CHECK-PWR8-NEXT: [[N_VEC:%.*]] = and i64 [[TMP3]], 8589934590
-; CHECK-PWR8-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 -1
-; CHECK-PWR8-NEXT: [[TMP5:%.*]] = add nsw i64 [[N_VEC]], -2
-; CHECK-PWR8-NEXT: [[TMP6:%.*]] = lshr exact i64 [[TMP5]], 1
-; CHECK-PWR8-NEXT: [[TMP7:%.*]] = add nuw i64 [[TMP6]], 1
-; CHECK-PWR8-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP7]], 1
-; CHECK-PWR8-NEXT: [[TMP8:%.*]] = icmp eq i64 [[TMP5]], 0
-; CHECK-PWR8-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
-; CHECK-PWR8: vector.ph.new:
-; CHECK-PWR8-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP7]], -2
-; CHECK-PWR8-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-PWR8: vector.body:
-; CHECK-PWR8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[NITER:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[OFFSET_IDX:%.*]] = sub nsw i64 [[TMP1]], [[INDEX]]
-; CHECK-PWR8-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 [[OFFSET_IDX]]
-; CHECK-PWR8-NEXT: [[TMP10:%.*]] = bitcast double* [[TMP9]] to <2 x double>*
-; CHECK-PWR8-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP10]], align 8
-; CHECK-PWR8-NEXT: [[REVERSE:%.*]] = shufflevector <2 x double> [[WIDE_LOAD]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-PWR8-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[REVERSE]], <double 3.000000e+00, double 3.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP12:%.*]] = fmul <2 x double> [[REVERSE]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP13:%.*]] = fadd <2 x double> [[TMP11]], [[TMP12]]
-; CHECK-PWR8-NEXT: [[TMP14:%.*]] = fadd <2 x double> [[TMP13]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP15:%.*]] = fmul <2 x double> [[TMP14]], <double 5.000000e-01, double 5.000000e-01>
-; CHECK-PWR8-NEXT: [[TMP16:%.*]] = fadd <2 x double> [[TMP12]], [[TMP15]]
-; CHECK-PWR8-NEXT: [[TMP17:%.*]] = fsub <2 x double> [[TMP16]], [[TMP11]]
-; CHECK-PWR8-NEXT: [[TMP18:%.*]] = fadd <2 x double> [[REVERSE]], [[TMP17]]
-; CHECK-PWR8-NEXT: [[TMP19:%.*]] = fdiv <2 x double> [[TMP14]], [[TMP18]]
-; CHECK-PWR8-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[TMP14]], [[TMP19]]
-; CHECK-PWR8-NEXT: [[TMP21:%.*]] = fmul <2 x double> [[TMP12]], [[TMP20]]
-; CHECK-PWR8-NEXT: [[TMP22:%.*]] = fmul <2 x double> [[TMP11]], [[TMP21]]
-; CHECK-PWR8-NEXT: [[TMP23:%.*]] = fadd <2 x double> [[TMP22]], <double -3.000000e+00, double -3.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP24:%.*]] = fsub <2 x double> [[REVERSE]], [[TMP11]]
-; CHECK-PWR8-NEXT: [[TMP25:%.*]] = fadd <2 x double> [[TMP12]], [[TMP24]]
-; CHECK-PWR8-NEXT: [[TMP26:%.*]] = fadd <2 x double> [[TMP25]], [[TMP19]]
-; CHECK-PWR8-NEXT: [[TMP27:%.*]] = fadd <2 x double> [[TMP26]], [[TMP23]]
-; CHECK-PWR8-NEXT: [[TMP28:%.*]] = fadd <2 x double> [[TMP27]], <double 3.000000e+00, double 3.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP29:%.*]] = fmul <2 x double> [[REVERSE]], [[TMP28]]
-; CHECK-PWR8-NEXT: [[REVERSE1:%.*]] = shufflevector <2 x double> [[TMP29]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-PWR8-NEXT: store <2 x double> [[REVERSE1]], <2 x double>* [[TMP10]], align 8
-; CHECK-PWR8-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 2
-; CHECK-PWR8-NEXT: [[OFFSET_IDX_1:%.*]] = sub nsw i64 [[TMP1]], [[INDEX_NEXT]]
-; CHECK-PWR8-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 [[OFFSET_IDX_1]]
-; CHECK-PWR8-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <2 x double>*
-; CHECK-PWR8-NEXT: [[WIDE_LOAD_1:%.*]] = load <2 x double>, <2 x double>* [[TMP31]], align 8
-; CHECK-PWR8-NEXT: [[REVERSE_1:%.*]] = shufflevector <2 x double> [[WIDE_LOAD_1]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-PWR8-NEXT: [[TMP32:%.*]] = fadd <2 x double> [[REVERSE_1]], <double 3.000000e+00, double 3.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP33:%.*]] = fmul <2 x double> [[REVERSE_1]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP34:%.*]] = fadd <2 x double> [[TMP32]], [[TMP33]]
-; CHECK-PWR8-NEXT: [[TMP35:%.*]] = fadd <2 x double> [[TMP34]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP36:%.*]] = fmul <2 x double> [[TMP35]], <double 5.000000e-01, double 5.000000e-01>
-; CHECK-PWR8-NEXT: [[TMP37:%.*]] = fadd <2 x double> [[TMP33]], [[TMP36]]
-; CHECK-PWR8-NEXT: [[TMP38:%.*]] = fsub <2 x double> [[TMP37]], [[TMP32]]
-; CHECK-PWR8-NEXT: [[TMP39:%.*]] = fadd <2 x double> [[REVERSE_1]], [[TMP38]]
-; CHECK-PWR8-NEXT: [[TMP40:%.*]] = fdiv <2 x double> [[TMP35]], [[TMP39]]
-; CHECK-PWR8-NEXT: [[TMP41:%.*]] = fmul <2 x double> [[TMP35]], [[TMP40]]
-; CHECK-PWR8-NEXT: [[TMP42:%.*]] = fmul <2 x double> [[TMP33]], [[TMP41]]
-; CHECK-PWR8-NEXT: [[TMP43:%.*]] = fmul <2 x double> [[TMP32]], [[TMP42]]
-; CHECK-PWR8-NEXT: [[TMP44:%.*]] = fadd <2 x double> [[TMP43]], <double -3.000000e+00, double -3.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP45:%.*]] = fsub <2 x double> [[REVERSE_1]], [[TMP32]]
-; CHECK-PWR8-NEXT: [[TMP46:%.*]] = fadd <2 x double> [[TMP33]], [[TMP45]]
-; CHECK-PWR8-NEXT: [[TMP47:%.*]] = fadd <2 x double> [[TMP46]], [[TMP40]]
-; CHECK-PWR8-NEXT: [[TMP48:%.*]] = fadd <2 x double> [[TMP47]], [[TMP44]]
-; CHECK-PWR8-NEXT: [[TMP49:%.*]] = fadd <2 x double> [[TMP48]], <double 3.000000e+00, double 3.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP50:%.*]] = fmul <2 x double> [[REVERSE_1]], [[TMP49]]
-; CHECK-PWR8-NEXT: [[REVERSE1_1:%.*]] = shufflevector <2 x double> [[TMP50]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-PWR8-NEXT: store <2 x double> [[REVERSE1_1]], <2 x double>* [[TMP31]], align 8
-; CHECK-PWR8-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 4
-; CHECK-PWR8-NEXT: [[NITER_NEXT_1]] = add nuw nsw i64 [[NITER]], 2
-; CHECK-PWR8-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
-; CHECK-PWR8-NEXT: br i1 [[NITER_NCMP_1]], label [[MIDDLE_BLOCK_UNR_LCSSA]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-PWR8: middle.block.unr-lcssa:
-; CHECK-PWR8-NEXT: [[INDEX_UNR:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT_1]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[XTRAITER]], 0
-; CHECK-PWR8-NEXT: br i1 [[LCMP_MOD_NOT]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY_EPIL:%.*]]
-; CHECK-PWR8: vector.body.epil:
-; CHECK-PWR8-NEXT: [[OFFSET_IDX_EPIL:%.*]] = sub nsw i64 [[TMP1]], [[INDEX_UNR]]
-; CHECK-PWR8-NEXT: [[TMP51:%.*]] = getelementptr inbounds double, double* [[TMP4]], i64 [[OFFSET_IDX_EPIL]]
-; CHECK-PWR8-NEXT: [[TMP52:%.*]] = bitcast double* [[TMP51]] to <2 x double>*
-; CHECK-PWR8-NEXT: [[WIDE_LOAD_EPIL:%.*]] = load <2 x double>, <2 x double>* [[TMP52]], align 8
-; CHECK-PWR8-NEXT: [[REVERSE_EPIL:%.*]] = shufflevector <2 x double> [[WIDE_LOAD_EPIL]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-PWR8-NEXT: [[TMP53:%.*]] = fadd <2 x double> [[REVERSE_EPIL]], <double 3.000000e+00, double 3.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP54:%.*]] = fmul <2 x double> [[REVERSE_EPIL]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP55:%.*]] = fadd <2 x double> [[TMP53]], [[TMP54]]
-; CHECK-PWR8-NEXT: [[TMP56:%.*]] = fadd <2 x double> [[TMP55]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP57:%.*]] = fmul <2 x double> [[TMP56]], <double 5.000000e-01, double 5.000000e-01>
-; CHECK-PWR8-NEXT: [[TMP58:%.*]] = fadd <2 x double> [[TMP54]], [[TMP57]]
-; CHECK-PWR8-NEXT: [[TMP59:%.*]] = fsub <2 x double> [[TMP58]], [[TMP53]]
-; CHECK-PWR8-NEXT: [[TMP60:%.*]] = fadd <2 x double> [[REVERSE_EPIL]], [[TMP59]]
-; CHECK-PWR8-NEXT: [[TMP61:%.*]] = fdiv <2 x double> [[TMP56]], [[TMP60]]
-; CHECK-PWR8-NEXT: [[TMP62:%.*]] = fmul <2 x double> [[TMP56]], [[TMP61]]
-; CHECK-PWR8-NEXT: [[TMP63:%.*]] = fmul <2 x double> [[TMP54]], [[TMP62]]
-; CHECK-PWR8-NEXT: [[TMP64:%.*]] = fmul <2 x double> [[TMP53]], [[TMP63]]
-; CHECK-PWR8-NEXT: [[TMP65:%.*]] = fadd <2 x double> [[TMP64]], <double -3.000000e+00, double -3.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP66:%.*]] = fsub <2 x double> [[REVERSE_EPIL]], [[TMP53]]
-; CHECK-PWR8-NEXT: [[TMP67:%.*]] = fadd <2 x double> [[TMP54]], [[TMP66]]
-; CHECK-PWR8-NEXT: [[TMP68:%.*]] = fadd <2 x double> [[TMP67]], [[TMP61]]
-; CHECK-PWR8-NEXT: [[TMP69:%.*]] = fadd <2 x double> [[TMP68]], [[TMP65]]
-; CHECK-PWR8-NEXT: [[TMP70:%.*]] = fadd <2 x double> [[TMP69]], <double 3.000000e+00, double 3.000000e+00>
-; CHECK-PWR8-NEXT: [[TMP71:%.*]] = fmul <2 x double> [[REVERSE_EPIL]], [[TMP70]]
-; CHECK-PWR8-NEXT: [[REVERSE1_EPIL:%.*]] = shufflevector <2 x double> [[TMP71]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-PWR8-NEXT: store <2 x double> [[REVERSE1_EPIL]], <2 x double>* [[TMP52]], align 8
-; CHECK-PWR8-NEXT: br label [[MIDDLE_BLOCK]]
-; CHECK-PWR8: middle.block:
-; CHECK-PWR8-NEXT: [[IND_END:%.*]] = sub nsw i64 [[TMP1]], [[N_VEC]]
-; CHECK-PWR8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
-; CHECK-PWR8-NEXT: br i1 [[CMP_N]], label [[DOTLOOPEXIT:%.*]], label [[SCALAR_PH_PREHEADER]]
-; CHECK-PWR8: scalar.ph.preheader:
-; CHECK-PWR8-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
-; CHECK-PWR8-NEXT: br label [[SCALAR_PH:%.*]]
-; CHECK-PWR8: scalar.ph:
-; CHECK-PWR8-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[SCALAR_PH]] ], [ [[INDVARS_IV_PH]], [[SCALAR_PH_PREHEADER]] ]
-; CHECK-PWR8-NEXT: [[TMP72:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
-; CHECK-PWR8-NEXT: [[TMP73:%.*]] = load double, double* [[TMP72]], align 8
-; CHECK-PWR8-NEXT: [[TMP74:%.*]] = fadd double [[TMP73]], 3.000000e+00
-; CHECK-PWR8-NEXT: [[TMP75:%.*]] = fmul double [[TMP73]], 2.000000e+00
-; CHECK-PWR8-NEXT: [[TMP76:%.*]] = fadd double [[TMP74]], [[TMP75]]
-; CHECK-PWR8-NEXT: [[TMP77:%.*]] = fadd double [[TMP76]], 2.000000e+00
-; CHECK-PWR8-NEXT: [[TMP78:%.*]] = fmul double [[TMP77]], 5.000000e-01
-; CHECK-PWR8-NEXT: [[TMP79:%.*]] = fadd double [[TMP75]], [[TMP78]]
-; CHECK-PWR8-NEXT: [[TMP80:%.*]] = fsub double [[TMP79]], [[TMP74]]
-; CHECK-PWR8-NEXT: [[TMP81:%.*]] = fadd double [[TMP73]], [[TMP80]]
-; CHECK-PWR8-NEXT: [[TMP82:%.*]] = fdiv double [[TMP77]], [[TMP81]]
-; CHECK-PWR8-NEXT: [[TMP83:%.*]] = fmul double [[TMP77]], [[TMP82]]
-; CHECK-PWR8-NEXT: [[TMP84:%.*]] = fmul double [[TMP75]], [[TMP83]]
-; CHECK-PWR8-NEXT: [[TMP85:%.*]] = fmul double [[TMP74]], [[TMP84]]
-; CHECK-PWR8-NEXT: [[TMP86:%.*]] = fadd double [[TMP85]], -3.000000e+00
-; CHECK-PWR8-NEXT: [[TMP87:%.*]] = fsub double [[TMP73]], [[TMP74]]
-; CHECK-PWR8-NEXT: [[TMP88:%.*]] = fadd double [[TMP75]], [[TMP87]]
-; CHECK-PWR8-NEXT: [[TMP89:%.*]] = fadd double [[TMP88]], [[TMP82]]
-; CHECK-PWR8-NEXT: [[TMP90:%.*]] = fadd double [[TMP89]], [[TMP86]]
-; CHECK-PWR8-NEXT: [[TMP91:%.*]] = fadd double [[TMP90]], 3.000000e+00
-; CHECK-PWR8-NEXT: [[TMP92:%.*]] = fmul double [[TMP73]], [[TMP91]]
-; CHECK-PWR8-NEXT: store double [[TMP92]], double* [[TMP72]], align 8
-; CHECK-PWR8-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], -1
-; CHECK-PWR8-NEXT: [[TMP93:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-PWR8-NEXT: [[TMP94:%.*]] = icmp eq i32 [[TMP93]], 0
-; CHECK-PWR8-NEXT: br i1 [[TMP94]], label [[DOTLOOPEXIT]], label [[SCALAR_PH]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-PWR8: .loopexit:
-; CHECK-PWR8-NEXT: ret void
-;
-; CHECK-PWR9-LABEL: @double_(
-; CHECK-PWR9-NEXT: [[TMP1:%.*]] = sext i32 [[N:%.*]] to i64
-; CHECK-PWR9-NEXT: [[TMP2:%.*]] = and i32 [[N]], 1
-; CHECK-PWR9-NEXT: [[LCMP_MOD_NOT_NOT:%.*]] = icmp eq i32 [[TMP2]], 0
-; CHECK-PWR9-NEXT: br i1 [[LCMP_MOD_NOT_NOT]], label [[DOTPROL_LOOPEXIT_UNR_LCSSA:%.*]], label [[DOTPROL_LOOPEXIT:%.*]]
-; CHECK-PWR9: .prol.loopexit.unr-lcssa:
-; CHECK-PWR9-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP1]]
-; CHECK-PWR9-NEXT: [[TMP4:%.*]] = load double, double* [[TMP3]], align 8
-; CHECK-PWR9-NEXT: [[TMP5:%.*]] = fadd double [[TMP4]], 3.000000e+00
-; CHECK-PWR9-NEXT: [[TMP6:%.*]] = fmul double [[TMP4]], 2.000000e+00
-; CHECK-PWR9-NEXT: [[TMP7:%.*]] = fadd double [[TMP5]], [[TMP6]]
-; CHECK-PWR9-NEXT: [[TMP8:%.*]] = fadd double [[TMP7]], 2.000000e+00
-; CHECK-PWR9-NEXT: [[TMP9:%.*]] = fmul double [[TMP8]], 5.000000e-01
-; CHECK-PWR9-NEXT: [[TMP10:%.*]] = fadd double [[TMP6]], [[TMP9]]
-; CHECK-PWR9-NEXT: [[TMP11:%.*]] = fsub double [[TMP10]], [[TMP5]]
-; CHECK-PWR9-NEXT: [[TMP12:%.*]] = fadd double [[TMP4]], [[TMP11]]
-; CHECK-PWR9-NEXT: [[TMP13:%.*]] = fdiv double [[TMP8]], [[TMP12]]
-; CHECK-PWR9-NEXT: [[TMP14:%.*]] = fmul double [[TMP8]], [[TMP13]]
-; CHECK-PWR9-NEXT: [[TMP15:%.*]] = fmul double [[TMP6]], [[TMP14]]
-; CHECK-PWR9-NEXT: [[TMP16:%.*]] = fmul double [[TMP5]], [[TMP15]]
-; CHECK-PWR9-NEXT: [[TMP17:%.*]] = fadd double [[TMP16]], -3.000000e+00
-; CHECK-PWR9-NEXT: [[TMP18:%.*]] = fsub double [[TMP4]], [[TMP5]]
-; CHECK-PWR9-NEXT: [[TMP19:%.*]] = fadd double [[TMP6]], [[TMP18]]
-; CHECK-PWR9-NEXT: [[TMP20:%.*]] = fadd double [[TMP19]], [[TMP13]]
-; CHECK-PWR9-NEXT: [[TMP21:%.*]] = fadd double [[TMP20]], [[TMP17]]
-; CHECK-PWR9-NEXT: [[TMP22:%.*]] = fadd double [[TMP21]], 3.000000e+00
-; CHECK-PWR9-NEXT: [[TMP23:%.*]] = fmul double [[TMP4]], [[TMP22]]
-; CHECK-PWR9-NEXT: store double [[TMP23]], double* [[TMP3]], align 8
-; CHECK-PWR9-NEXT: [[INDVARS_IV_NEXT_PROL:%.*]] = add nsw i64 [[TMP1]], -1
-; CHECK-PWR9-NEXT: br label [[DOTPROL_LOOPEXIT]]
-; CHECK-PWR9: .prol.loopexit:
-; CHECK-PWR9-NEXT: [[INDVARS_IV_UNR:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[INDVARS_IV_NEXT_PROL]], [[DOTPROL_LOOPEXIT_UNR_LCSSA]] ]
-; CHECK-PWR9-NEXT: [[TMP24:%.*]] = icmp eq i32 [[N]], 0
-; CHECK-PWR9-NEXT: br i1 [[TMP24]], label [[DOTUNR_LCSSA:%.*]], label [[DOTNEW:%.*]]
-; CHECK-PWR9: .new:
-; CHECK-PWR9-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[DOTNEW]] ], [ [[INDVARS_IV_UNR]], [[DOTPROL_LOOPEXIT]] ]
-; CHECK-PWR9-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
-; CHECK-PWR9-NEXT: [[TMP26:%.*]] = load double, double* [[TMP25]], align 8
-; CHECK-PWR9-NEXT: [[TMP27:%.*]] = fadd double [[TMP26]], 3.000000e+00
-; CHECK-PWR9-NEXT: [[TMP28:%.*]] = fmul double [[TMP26]], 2.000000e+00
-; CHECK-PWR9-NEXT: [[TMP29:%.*]] = fadd double [[TMP27]], [[TMP28]]
-; CHECK-PWR9-NEXT: [[TMP30:%.*]] = fadd double [[TMP29]], 2.000000e+00
-; CHECK-PWR9-NEXT: [[TMP31:%.*]] = fmul double [[TMP30]], 5.000000e-01
-; CHECK-PWR9-NEXT: [[TMP32:%.*]] = fadd double [[TMP28]], [[TMP31]]
-; CHECK-PWR9-NEXT: [[TMP33:%.*]] = fsub double [[TMP32]], [[TMP27]]
-; CHECK-PWR9-NEXT: [[TMP34:%.*]] = fadd double [[TMP26]], [[TMP33]]
-; CHECK-PWR9-NEXT: [[TMP35:%.*]] = fdiv double [[TMP30]], [[TMP34]]
-; CHECK-PWR9-NEXT: [[TMP36:%.*]] = fmul double [[TMP30]], [[TMP35]]
-; CHECK-PWR9-NEXT: [[TMP37:%.*]] = fmul double [[TMP28]], [[TMP36]]
-; CHECK-PWR9-NEXT: [[TMP38:%.*]] = fmul double [[TMP27]], [[TMP37]]
-; CHECK-PWR9-NEXT: [[TMP39:%.*]] = fadd double [[TMP38]], -3.000000e+00
-; CHECK-PWR9-NEXT: [[TMP40:%.*]] = fsub double [[TMP26]], [[TMP27]]
-; CHECK-PWR9-NEXT: [[TMP41:%.*]] = fadd double [[TMP28]], [[TMP40]]
-; CHECK-PWR9-NEXT: [[TMP42:%.*]] = fadd double [[TMP41]], [[TMP35]]
-; CHECK-PWR9-NEXT: [[TMP43:%.*]] = fadd double [[TMP42]], [[TMP39]]
-; CHECK-PWR9-NEXT: [[TMP44:%.*]] = fadd double [[TMP43]], 3.000000e+00
-; CHECK-PWR9-NEXT: [[TMP45:%.*]] = fmul double [[TMP26]], [[TMP44]]
-; CHECK-PWR9-NEXT: store double [[TMP45]], double* [[TMP25]], align 8
-; CHECK-PWR9-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1
-; CHECK-PWR9-NEXT: [[TMP46:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-PWR9-NEXT: [[TMP47:%.*]] = load double, double* [[TMP46]], align 8
-; CHECK-PWR9-NEXT: [[TMP48:%.*]] = fadd double [[TMP47]], 3.000000e+00
-; CHECK-PWR9-NEXT: [[TMP49:%.*]] = fmul double [[TMP47]], 2.000000e+00
-; CHECK-PWR9-NEXT: [[TMP50:%.*]] = fadd double [[TMP48]], [[TMP49]]
-; CHECK-PWR9-NEXT: [[TMP51:%.*]] = fadd double [[TMP50]], 2.000000e+00
-; CHECK-PWR9-NEXT: [[TMP52:%.*]] = fmul double [[TMP51]], 5.000000e-01
-; CHECK-PWR9-NEXT: [[TMP53:%.*]] = fadd double [[TMP49]], [[TMP52]]
-; CHECK-PWR9-NEXT: [[TMP54:%.*]] = fsub double [[TMP53]], [[TMP48]]
-; CHECK-PWR9-NEXT: [[TMP55:%.*]] = fadd double [[TMP47]], [[TMP54]]
-; CHECK-PWR9-NEXT: [[TMP56:%.*]] = fdiv double [[TMP51]], [[TMP55]]
-; CHECK-PWR9-NEXT: [[TMP57:%.*]] = fmul double [[TMP51]], [[TMP56]]
-; CHECK-PWR9-NEXT: [[TMP58:%.*]] = fmul double [[TMP49]], [[TMP57]]
-; CHECK-PWR9-NEXT: [[TMP59:%.*]] = fmul double [[TMP48]], [[TMP58]]
-; CHECK-PWR9-NEXT: [[TMP60:%.*]] = fadd double [[TMP59]], -3.000000e+00
-; CHECK-PWR9-NEXT: [[TMP61:%.*]] = fsub double [[TMP47]], [[TMP48]]
-; CHECK-PWR9-NEXT: [[TMP62:%.*]] = fadd double [[TMP49]], [[TMP61]]
-; CHECK-PWR9-NEXT: [[TMP63:%.*]] = fadd double [[TMP62]], [[TMP56]]
-; CHECK-PWR9-NEXT: [[TMP64:%.*]] = fadd double [[TMP63]], [[TMP60]]
-; CHECK-PWR9-NEXT: [[TMP65:%.*]] = fadd double [[TMP64]], 3.000000e+00
-; CHECK-PWR9-NEXT: [[TMP66:%.*]] = fmul double [[TMP47]], [[TMP65]]
-; CHECK-PWR9-NEXT: store double [[TMP66]], double* [[TMP46]], align 8
-; CHECK-PWR9-NEXT: [[INDVARS_IV_NEXT_1]] = add nsw i64 [[INDVARS_IV]], -2
-; CHECK-PWR9-NEXT: [[TMP67:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-PWR9-NEXT: [[TMP68:%.*]] = icmp eq i32 [[TMP67]], 0
-; CHECK-PWR9-NEXT: br i1 [[TMP68]], label [[DOTUNR_LCSSA]], label [[DOTNEW]]
-; CHECK-PWR9: .unr-lcssa:
-; CHECK-PWR9-NEXT: ret void
-;
-
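+; For @double_ the expected vectorization factor differs by subtarget:
+; VF=2 on Power8 and VF=1 on Power9, each with its own register-usage estimate.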
+;CHECK-LABEL: double_
+;CHECK-PWR8: LV(REG): VF = 2
+;CHECK-PWR8: LV(REG): Found max usage: 2 item
+;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
+;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 5 registers
+;CHECK-PWR8: LV(REG): Found invariant usage: 1 item
+;CHECK-PWR8-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 1 registers
+
+;CHECK-PWR9: LV(REG): VF = 1
+;CHECK-PWR9: LV(REG): Found max usage: 2 item
+;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
+;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::VSXRC, 5 registers
+;CHECK-PWR9: LV(REG): Found invariant usage: 1 item
+;CHECK-PWR9-NEXT: LV(REG): RegisterClass: PPC::GPRRC, 1 registers
%1 = sext i32 %n to i64
br label %2
}
define ppc_fp128 @fp128_(ppc_fp128* nocapture %n, ppc_fp128 %d) nounwind readonly {
-; CHECK-PWR8-LABEL: @fp128_(
-; CHECK-PWR8-NEXT: entry:
-; CHECK-PWR8-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-PWR8: vector.body:
-; CHECK-PWR8-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI:%.*]] = phi ppc_fp128 [ [[D:%.*]], [[ENTRY]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI1:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI2:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI3:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI4:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI5:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI6:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP42:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI7:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP43:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI8:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP44:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI9:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP45:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI10:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP46:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[VEC_PHI11:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP47:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR8-NEXT: [[INDUCTION12:%.*]] = or i32 [[INDEX]], 1
-; CHECK-PWR8-NEXT: [[INDUCTION13:%.*]] = or i32 [[INDEX]], 2
-; CHECK-PWR8-NEXT: [[INDUCTION14:%.*]] = or i32 [[INDEX]], 3
-; CHECK-PWR8-NEXT: [[INDUCTION15:%.*]] = add nuw nsw i32 [[INDEX]], 4
-; CHECK-PWR8-NEXT: [[INDUCTION16:%.*]] = add nuw nsw i32 [[INDEX]], 5
-; CHECK-PWR8-NEXT: [[INDUCTION17:%.*]] = add nuw nsw i32 [[INDEX]], 6
-; CHECK-PWR8-NEXT: [[INDUCTION18:%.*]] = add nuw nsw i32 [[INDEX]], 7
-; CHECK-PWR8-NEXT: [[INDUCTION19:%.*]] = add nuw nsw i32 [[INDEX]], 8
-; CHECK-PWR8-NEXT: [[INDUCTION20:%.*]] = add nuw nsw i32 [[INDEX]], 9
-; CHECK-PWR8-NEXT: [[INDUCTION21:%.*]] = add nuw nsw i32 [[INDEX]], 10
-; CHECK-PWR8-NEXT: [[INDUCTION22:%.*]] = add nuw nsw i32 [[INDEX]], 11
-; CHECK-PWR8-NEXT: [[TMP0:%.*]] = zext i32 [[INDEX]] to i64
-; CHECK-PWR8-NEXT: [[TMP1:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N:%.*]], i64 [[TMP0]]
-; CHECK-PWR8-NEXT: [[TMP2:%.*]] = zext i32 [[INDUCTION12]] to i64
-; CHECK-PWR8-NEXT: [[TMP3:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP2]]
-; CHECK-PWR8-NEXT: [[TMP4:%.*]] = zext i32 [[INDUCTION13]] to i64
-; CHECK-PWR8-NEXT: [[TMP5:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP4]]
-; CHECK-PWR8-NEXT: [[TMP6:%.*]] = zext i32 [[INDUCTION14]] to i64
-; CHECK-PWR8-NEXT: [[TMP7:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP6]]
-; CHECK-PWR8-NEXT: [[TMP8:%.*]] = zext i32 [[INDUCTION15]] to i64
-; CHECK-PWR8-NEXT: [[TMP9:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP8]]
-; CHECK-PWR8-NEXT: [[TMP10:%.*]] = zext i32 [[INDUCTION16]] to i64
-; CHECK-PWR8-NEXT: [[TMP11:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP10]]
-; CHECK-PWR8-NEXT: [[TMP12:%.*]] = zext i32 [[INDUCTION17]] to i64
-; CHECK-PWR8-NEXT: [[TMP13:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP12]]
-; CHECK-PWR8-NEXT: [[TMP14:%.*]] = zext i32 [[INDUCTION18]] to i64
-; CHECK-PWR8-NEXT: [[TMP15:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP14]]
-; CHECK-PWR8-NEXT: [[TMP16:%.*]] = zext i32 [[INDUCTION19]] to i64
-; CHECK-PWR8-NEXT: [[TMP17:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP16]]
-; CHECK-PWR8-NEXT: [[TMP18:%.*]] = zext i32 [[INDUCTION20]] to i64
-; CHECK-PWR8-NEXT: [[TMP19:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP18]]
-; CHECK-PWR8-NEXT: [[TMP20:%.*]] = zext i32 [[INDUCTION21]] to i64
-; CHECK-PWR8-NEXT: [[TMP21:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP20]]
-; CHECK-PWR8-NEXT: [[TMP22:%.*]] = zext i32 [[INDUCTION22]] to i64
-; CHECK-PWR8-NEXT: [[TMP23:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP22]]
-; CHECK-PWR8-NEXT: [[TMP24:%.*]] = load ppc_fp128, ppc_fp128* [[TMP1]], align 8
-; CHECK-PWR8-NEXT: [[TMP25:%.*]] = load ppc_fp128, ppc_fp128* [[TMP3]], align 8
-; CHECK-PWR8-NEXT: [[TMP26:%.*]] = load ppc_fp128, ppc_fp128* [[TMP5]], align 8
-; CHECK-PWR8-NEXT: [[TMP27:%.*]] = load ppc_fp128, ppc_fp128* [[TMP7]], align 8
-; CHECK-PWR8-NEXT: [[TMP28:%.*]] = load ppc_fp128, ppc_fp128* [[TMP9]], align 8
-; CHECK-PWR8-NEXT: [[TMP29:%.*]] = load ppc_fp128, ppc_fp128* [[TMP11]], align 8
-; CHECK-PWR8-NEXT: [[TMP30:%.*]] = load ppc_fp128, ppc_fp128* [[TMP13]], align 8
-; CHECK-PWR8-NEXT: [[TMP31:%.*]] = load ppc_fp128, ppc_fp128* [[TMP15]], align 8
-; CHECK-PWR8-NEXT: [[TMP32:%.*]] = load ppc_fp128, ppc_fp128* [[TMP17]], align 8
-; CHECK-PWR8-NEXT: [[TMP33:%.*]] = load ppc_fp128, ppc_fp128* [[TMP19]], align 8
-; CHECK-PWR8-NEXT: [[TMP34:%.*]] = load ppc_fp128, ppc_fp128* [[TMP21]], align 8
-; CHECK-PWR8-NEXT: [[TMP35:%.*]] = load ppc_fp128, ppc_fp128* [[TMP23]], align 8
-; CHECK-PWR8-NEXT: [[TMP36]] = fsub fast ppc_fp128 [[VEC_PHI]], [[TMP24]]
-; CHECK-PWR8-NEXT: [[TMP37]] = fsub fast ppc_fp128 [[VEC_PHI1]], [[TMP25]]
-; CHECK-PWR8-NEXT: [[TMP38]] = fsub fast ppc_fp128 [[VEC_PHI2]], [[TMP26]]
-; CHECK-PWR8-NEXT: [[TMP39]] = fsub fast ppc_fp128 [[VEC_PHI3]], [[TMP27]]
-; CHECK-PWR8-NEXT: [[TMP40]] = fsub fast ppc_fp128 [[VEC_PHI4]], [[TMP28]]
-; CHECK-PWR8-NEXT: [[TMP41]] = fsub fast ppc_fp128 [[VEC_PHI5]], [[TMP29]]
-; CHECK-PWR8-NEXT: [[TMP42]] = fsub fast ppc_fp128 [[VEC_PHI6]], [[TMP30]]
-; CHECK-PWR8-NEXT: [[TMP43]] = fsub fast ppc_fp128 [[VEC_PHI7]], [[TMP31]]
-; CHECK-PWR8-NEXT: [[TMP44]] = fsub fast ppc_fp128 [[VEC_PHI8]], [[TMP32]]
-; CHECK-PWR8-NEXT: [[TMP45]] = fsub fast ppc_fp128 [[VEC_PHI9]], [[TMP33]]
-; CHECK-PWR8-NEXT: [[TMP46]] = fsub fast ppc_fp128 [[VEC_PHI10]], [[TMP34]]
-; CHECK-PWR8-NEXT: [[TMP47]] = fsub fast ppc_fp128 [[VEC_PHI11]], [[TMP35]]
-; CHECK-PWR8-NEXT: [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 12
-; CHECK-PWR8-NEXT: [[TMP48:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2040
-; CHECK-PWR8-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK-PWR8: middle.block:
-; CHECK-PWR8-NEXT: [[BIN_RDX:%.*]] = fadd fast ppc_fp128 [[TMP37]], [[TMP36]]
-; CHECK-PWR8-NEXT: [[BIN_RDX23:%.*]] = fadd fast ppc_fp128 [[BIN_RDX]], [[TMP38]]
-; CHECK-PWR8-NEXT: [[BIN_RDX24:%.*]] = fadd fast ppc_fp128 [[BIN_RDX23]], [[TMP39]]
-; CHECK-PWR8-NEXT: [[BIN_RDX25:%.*]] = fadd fast ppc_fp128 [[BIN_RDX24]], [[TMP40]]
-; CHECK-PWR8-NEXT: [[BIN_RDX26:%.*]] = fadd fast ppc_fp128 [[BIN_RDX25]], [[TMP41]]
-; CHECK-PWR8-NEXT: [[BIN_RDX27:%.*]] = fadd fast ppc_fp128 [[BIN_RDX26]], [[TMP42]]
-; CHECK-PWR8-NEXT: [[BIN_RDX28:%.*]] = fadd fast ppc_fp128 [[BIN_RDX27]], [[TMP43]]
-; CHECK-PWR8-NEXT: [[BIN_RDX29:%.*]] = fadd fast ppc_fp128 [[BIN_RDX28]], [[TMP44]]
-; CHECK-PWR8-NEXT: [[BIN_RDX30:%.*]] = fadd fast ppc_fp128 [[BIN_RDX29]], [[TMP45]]
-; CHECK-PWR8-NEXT: [[BIN_RDX31:%.*]] = fadd fast ppc_fp128 [[BIN_RDX30]], [[TMP46]]
-; CHECK-PWR8-NEXT: [[BIN_RDX32:%.*]] = fadd fast ppc_fp128 [[BIN_RDX31]], [[TMP47]]
-; CHECK-PWR8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2040
-; CHECK-PWR8-NEXT: [[TMP49:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX]], align 8
-; CHECK-PWR8-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2041
-; CHECK-PWR8-NEXT: [[TMP50:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_1]], align 8
-; CHECK-PWR8-NEXT: [[TMP51:%.*]] = fadd fast ppc_fp128 [[TMP49]], [[TMP50]]
-; CHECK-PWR8-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2042
-; CHECK-PWR8-NEXT: [[TMP52:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_2]], align 8
-; CHECK-PWR8-NEXT: [[TMP53:%.*]] = fadd fast ppc_fp128 [[TMP51]], [[TMP52]]
-; CHECK-PWR8-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2043
-; CHECK-PWR8-NEXT: [[TMP54:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_3]], align 8
-; CHECK-PWR8-NEXT: [[TMP55:%.*]] = fadd fast ppc_fp128 [[TMP53]], [[TMP54]]
-; CHECK-PWR8-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2044
-; CHECK-PWR8-NEXT: [[TMP56:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_4]], align 8
-; CHECK-PWR8-NEXT: [[TMP57:%.*]] = fadd fast ppc_fp128 [[TMP55]], [[TMP56]]
-; CHECK-PWR8-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2045
-; CHECK-PWR8-NEXT: [[TMP58:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_5]], align 8
-; CHECK-PWR8-NEXT: [[TMP59:%.*]] = fadd fast ppc_fp128 [[TMP57]], [[TMP58]]
-; CHECK-PWR8-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2046
-; CHECK-PWR8-NEXT: [[TMP60:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_6]], align 8
-; CHECK-PWR8-NEXT: [[TMP61:%.*]] = fadd fast ppc_fp128 [[TMP59]], [[TMP60]]
-; CHECK-PWR8-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2047
-; CHECK-PWR8-NEXT: [[TMP62:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_7]], align 8
-; CHECK-PWR8-NEXT: [[TMP63:%.*]] = fadd fast ppc_fp128 [[TMP61]], [[TMP62]]
-; CHECK-PWR8-NEXT: [[SUB_7:%.*]] = fsub fast ppc_fp128 [[BIN_RDX32]], [[TMP63]]
-; CHECK-PWR8-NEXT: ret ppc_fp128 [[SUB_7]]
-;
-; CHECK-PWR9-LABEL: @fp128_(
-; CHECK-PWR9-NEXT: entry:
-; CHECK-PWR9-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-PWR9: vector.body:
-; CHECK-PWR9-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI:%.*]] = phi ppc_fp128 [ [[D:%.*]], [[ENTRY]] ], [ [[TMP36:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI1:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP37:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI2:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP38:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI3:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI4:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP40:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI5:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI6:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP42:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI7:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP43:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI8:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP44:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI9:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP45:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI10:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP46:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[VEC_PHI11:%.*]] = phi ppc_fp128 [ 0xM00000000000000000000000000000000, [[ENTRY]] ], [ [[TMP47:%.*]], [[VECTOR_BODY]] ]
-; CHECK-PWR9-NEXT: [[INDUCTION12:%.*]] = or i32 [[INDEX]], 1
-; CHECK-PWR9-NEXT: [[INDUCTION13:%.*]] = or i32 [[INDEX]], 2
-; CHECK-PWR9-NEXT: [[INDUCTION14:%.*]] = or i32 [[INDEX]], 3
-; CHECK-PWR9-NEXT: [[INDUCTION15:%.*]] = add nuw nsw i32 [[INDEX]], 4
-; CHECK-PWR9-NEXT: [[INDUCTION16:%.*]] = add nuw nsw i32 [[INDEX]], 5
-; CHECK-PWR9-NEXT: [[INDUCTION17:%.*]] = add nuw nsw i32 [[INDEX]], 6
-; CHECK-PWR9-NEXT: [[INDUCTION18:%.*]] = add nuw nsw i32 [[INDEX]], 7
-; CHECK-PWR9-NEXT: [[INDUCTION19:%.*]] = add nuw nsw i32 [[INDEX]], 8
-; CHECK-PWR9-NEXT: [[INDUCTION20:%.*]] = add nuw nsw i32 [[INDEX]], 9
-; CHECK-PWR9-NEXT: [[INDUCTION21:%.*]] = add nuw nsw i32 [[INDEX]], 10
-; CHECK-PWR9-NEXT: [[INDUCTION22:%.*]] = add nuw nsw i32 [[INDEX]], 11
-; CHECK-PWR9-NEXT: [[TMP0:%.*]] = zext i32 [[INDEX]] to i64
-; CHECK-PWR9-NEXT: [[TMP1:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N:%.*]], i64 [[TMP0]]
-; CHECK-PWR9-NEXT: [[TMP2:%.*]] = zext i32 [[INDUCTION12]] to i64
-; CHECK-PWR9-NEXT: [[TMP3:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP2]]
-; CHECK-PWR9-NEXT: [[TMP4:%.*]] = zext i32 [[INDUCTION13]] to i64
-; CHECK-PWR9-NEXT: [[TMP5:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP4]]
-; CHECK-PWR9-NEXT: [[TMP6:%.*]] = zext i32 [[INDUCTION14]] to i64
-; CHECK-PWR9-NEXT: [[TMP7:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP6]]
-; CHECK-PWR9-NEXT: [[TMP8:%.*]] = zext i32 [[INDUCTION15]] to i64
-; CHECK-PWR9-NEXT: [[TMP9:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP8]]
-; CHECK-PWR9-NEXT: [[TMP10:%.*]] = zext i32 [[INDUCTION16]] to i64
-; CHECK-PWR9-NEXT: [[TMP11:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP10]]
-; CHECK-PWR9-NEXT: [[TMP12:%.*]] = zext i32 [[INDUCTION17]] to i64
-; CHECK-PWR9-NEXT: [[TMP13:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP12]]
-; CHECK-PWR9-NEXT: [[TMP14:%.*]] = zext i32 [[INDUCTION18]] to i64
-; CHECK-PWR9-NEXT: [[TMP15:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP14]]
-; CHECK-PWR9-NEXT: [[TMP16:%.*]] = zext i32 [[INDUCTION19]] to i64
-; CHECK-PWR9-NEXT: [[TMP17:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP16]]
-; CHECK-PWR9-NEXT: [[TMP18:%.*]] = zext i32 [[INDUCTION20]] to i64
-; CHECK-PWR9-NEXT: [[TMP19:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP18]]
-; CHECK-PWR9-NEXT: [[TMP20:%.*]] = zext i32 [[INDUCTION21]] to i64
-; CHECK-PWR9-NEXT: [[TMP21:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP20]]
-; CHECK-PWR9-NEXT: [[TMP22:%.*]] = zext i32 [[INDUCTION22]] to i64
-; CHECK-PWR9-NEXT: [[TMP23:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 [[TMP22]]
-; CHECK-PWR9-NEXT: [[TMP24:%.*]] = load ppc_fp128, ppc_fp128* [[TMP1]], align 8
-; CHECK-PWR9-NEXT: [[TMP25:%.*]] = load ppc_fp128, ppc_fp128* [[TMP3]], align 8
-; CHECK-PWR9-NEXT: [[TMP26:%.*]] = load ppc_fp128, ppc_fp128* [[TMP5]], align 8
-; CHECK-PWR9-NEXT: [[TMP27:%.*]] = load ppc_fp128, ppc_fp128* [[TMP7]], align 8
-; CHECK-PWR9-NEXT: [[TMP28:%.*]] = load ppc_fp128, ppc_fp128* [[TMP9]], align 8
-; CHECK-PWR9-NEXT: [[TMP29:%.*]] = load ppc_fp128, ppc_fp128* [[TMP11]], align 8
-; CHECK-PWR9-NEXT: [[TMP30:%.*]] = load ppc_fp128, ppc_fp128* [[TMP13]], align 8
-; CHECK-PWR9-NEXT: [[TMP31:%.*]] = load ppc_fp128, ppc_fp128* [[TMP15]], align 8
-; CHECK-PWR9-NEXT: [[TMP32:%.*]] = load ppc_fp128, ppc_fp128* [[TMP17]], align 8
-; CHECK-PWR9-NEXT: [[TMP33:%.*]] = load ppc_fp128, ppc_fp128* [[TMP19]], align 8
-; CHECK-PWR9-NEXT: [[TMP34:%.*]] = load ppc_fp128, ppc_fp128* [[TMP21]], align 8
-; CHECK-PWR9-NEXT: [[TMP35:%.*]] = load ppc_fp128, ppc_fp128* [[TMP23]], align 8
-; CHECK-PWR9-NEXT: [[TMP36]] = fsub fast ppc_fp128 [[VEC_PHI]], [[TMP24]]
-; CHECK-PWR9-NEXT: [[TMP37]] = fsub fast ppc_fp128 [[VEC_PHI1]], [[TMP25]]
-; CHECK-PWR9-NEXT: [[TMP38]] = fsub fast ppc_fp128 [[VEC_PHI2]], [[TMP26]]
-; CHECK-PWR9-NEXT: [[TMP39]] = fsub fast ppc_fp128 [[VEC_PHI3]], [[TMP27]]
-; CHECK-PWR9-NEXT: [[TMP40]] = fsub fast ppc_fp128 [[VEC_PHI4]], [[TMP28]]
-; CHECK-PWR9-NEXT: [[TMP41]] = fsub fast ppc_fp128 [[VEC_PHI5]], [[TMP29]]
-; CHECK-PWR9-NEXT: [[TMP42]] = fsub fast ppc_fp128 [[VEC_PHI6]], [[TMP30]]
-; CHECK-PWR9-NEXT: [[TMP43]] = fsub fast ppc_fp128 [[VEC_PHI7]], [[TMP31]]
-; CHECK-PWR9-NEXT: [[TMP44]] = fsub fast ppc_fp128 [[VEC_PHI8]], [[TMP32]]
-; CHECK-PWR9-NEXT: [[TMP45]] = fsub fast ppc_fp128 [[VEC_PHI9]], [[TMP33]]
-; CHECK-PWR9-NEXT: [[TMP46]] = fsub fast ppc_fp128 [[VEC_PHI10]], [[TMP34]]
-; CHECK-PWR9-NEXT: [[TMP47]] = fsub fast ppc_fp128 [[VEC_PHI11]], [[TMP35]]
-; CHECK-PWR9-NEXT: [[INDEX_NEXT]] = add nuw nsw i32 [[INDEX]], 12
-; CHECK-PWR9-NEXT: [[TMP48:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2040
-; CHECK-PWR9-NEXT: br i1 [[TMP48]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-PWR9: middle.block:
-; CHECK-PWR9-NEXT: [[BIN_RDX:%.*]] = fadd fast ppc_fp128 [[TMP37]], [[TMP36]]
-; CHECK-PWR9-NEXT: [[BIN_RDX23:%.*]] = fadd fast ppc_fp128 [[BIN_RDX]], [[TMP38]]
-; CHECK-PWR9-NEXT: [[BIN_RDX24:%.*]] = fadd fast ppc_fp128 [[BIN_RDX23]], [[TMP39]]
-; CHECK-PWR9-NEXT: [[BIN_RDX25:%.*]] = fadd fast ppc_fp128 [[BIN_RDX24]], [[TMP40]]
-; CHECK-PWR9-NEXT: [[BIN_RDX26:%.*]] = fadd fast ppc_fp128 [[BIN_RDX25]], [[TMP41]]
-; CHECK-PWR9-NEXT: [[BIN_RDX27:%.*]] = fadd fast ppc_fp128 [[BIN_RDX26]], [[TMP42]]
-; CHECK-PWR9-NEXT: [[BIN_RDX28:%.*]] = fadd fast ppc_fp128 [[BIN_RDX27]], [[TMP43]]
-; CHECK-PWR9-NEXT: [[BIN_RDX29:%.*]] = fadd fast ppc_fp128 [[BIN_RDX28]], [[TMP44]]
-; CHECK-PWR9-NEXT: [[BIN_RDX30:%.*]] = fadd fast ppc_fp128 [[BIN_RDX29]], [[TMP45]]
-; CHECK-PWR9-NEXT: [[BIN_RDX31:%.*]] = fadd fast ppc_fp128 [[BIN_RDX30]], [[TMP46]]
-; CHECK-PWR9-NEXT: [[BIN_RDX32:%.*]] = fadd fast ppc_fp128 [[BIN_RDX31]], [[TMP47]]
-; CHECK-PWR9-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2040
-; CHECK-PWR9-NEXT: [[TMP49:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX]], align 8
-; CHECK-PWR9-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2041
-; CHECK-PWR9-NEXT: [[TMP50:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_1]], align 8
-; CHECK-PWR9-NEXT: [[TMP51:%.*]] = fadd fast ppc_fp128 [[TMP49]], [[TMP50]]
-; CHECK-PWR9-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2042
-; CHECK-PWR9-NEXT: [[TMP52:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_2]], align 8
-; CHECK-PWR9-NEXT: [[TMP53:%.*]] = fadd fast ppc_fp128 [[TMP51]], [[TMP52]]
-; CHECK-PWR9-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2043
-; CHECK-PWR9-NEXT: [[TMP54:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_3]], align 8
-; CHECK-PWR9-NEXT: [[TMP55:%.*]] = fadd fast ppc_fp128 [[TMP53]], [[TMP54]]
-; CHECK-PWR9-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2044
-; CHECK-PWR9-NEXT: [[TMP56:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_4]], align 8
-; CHECK-PWR9-NEXT: [[TMP57:%.*]] = fadd fast ppc_fp128 [[TMP55]], [[TMP56]]
-; CHECK-PWR9-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2045
-; CHECK-PWR9-NEXT: [[TMP58:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_5]], align 8
-; CHECK-PWR9-NEXT: [[TMP59:%.*]] = fadd fast ppc_fp128 [[TMP57]], [[TMP58]]
-; CHECK-PWR9-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2046
-; CHECK-PWR9-NEXT: [[TMP60:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_6]], align 8
-; CHECK-PWR9-NEXT: [[TMP61:%.*]] = fadd fast ppc_fp128 [[TMP59]], [[TMP60]]
-; CHECK-PWR9-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds ppc_fp128, ppc_fp128* [[N]], i64 2047
-; CHECK-PWR9-NEXT: [[TMP62:%.*]] = load ppc_fp128, ppc_fp128* [[ARRAYIDX_7]], align 8
-; CHECK-PWR9-NEXT: [[TMP63:%.*]] = fadd fast ppc_fp128 [[TMP61]], [[TMP62]]
-; CHECK-PWR9-NEXT: [[SUB_7:%.*]] = fsub fast ppc_fp128 [[BIN_RDX32]], [[TMP63]]
-; CHECK-PWR9-NEXT: ret ppc_fp128 [[SUB_7]]
-;
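+; @fp128_ is expected to stay scalar (VF=1); check the register usage reported
+; for the scalar plan.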
+;CHECK-LABEL: fp128_
+;CHECK: LV(REG): VF = 1
+;CHECK: LV(REG): Found max usage: 2 item
+;CHECK: LV(REG): RegisterClass: PPC::GPRRC, 2 registers
+;CHECK: LV(REG): RegisterClass: PPC::VRRC, 2 registers
entry:
br label %for.body
define void @fp16_(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %numRows, i32 %numCols, i32 %scale.coerce) #0 {
-; CHECK-PWR8-LABEL: @fp16_(
-; CHECK-PWR8-NEXT: entry:
-; CHECK-PWR8-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[SCALE_COERCE:%.*]] to i16
-; CHECK-PWR8-NEXT: [[TMP0:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-; CHECK-PWR8-NEXT: [[MUL:%.*]] = mul i32 [[NUMCOLS:%.*]], [[NUMROWS:%.*]]
-; CHECK-PWR8-NEXT: [[CMP26:%.*]] = icmp ult i32 [[MUL]], 4
-; CHECK-PWR8-NEXT: br i1 [[CMP26]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
-; CHECK-PWR8: while.body.preheader:
-; CHECK-PWR8-NEXT: [[SHR:%.*]] = lshr i32 [[MUL]], 2
-; CHECK-PWR8-NEXT: [[TMP1:%.*]] = add nsw i32 [[SHR]], -1
-; CHECK-PWR8-NEXT: [[XTRAITER:%.*]] = and i32 [[SHR]], 7
-; CHECK-PWR8-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 0
-; CHECK-PWR8-NEXT: br i1 [[LCMP_MOD_NOT]], label [[WHILE_BODY_PROL_LOOPEXIT:%.*]], label [[WHILE_BODY_PROL:%.*]]
-; CHECK-PWR8: while.body.prol:
-; CHECK-PWR8-NEXT: [[PIN_ADDR_029_PROL:%.*]] = phi half* [ [[ADD_PTR_PROL:%.*]], [[WHILE_BODY_PROL]] ], [ [[PIN:%.*]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR8-NEXT: [[POUT_ADDR_028_PROL:%.*]] = phi half* [ [[ADD_PTR7_PROL:%.*]], [[WHILE_BODY_PROL]] ], [ [[POUT:%.*]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR8-NEXT: [[BLKCNT_027_PROL:%.*]] = phi i32 [ [[DEC_PROL:%.*]], [[WHILE_BODY_PROL]] ], [ [[SHR]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR8-NEXT: [[PROL_ITER:%.*]] = phi i32 [ [[PROL_ITER_NEXT:%.*]], [[WHILE_BODY_PROL]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR8-NEXT: [[TMP2:%.*]] = load half, half* [[PIN_ADDR_029_PROL]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX2_PROL:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029_PROL]], i64 1
-; CHECK-PWR8-NEXT: [[TMP3:%.*]] = load half, half* [[ARRAYIDX2_PROL]], align 2
-; CHECK-PWR8-NEXT: [[MUL3_PROL:%.*]] = fmul half [[TMP2]], [[TMP0]]
-; CHECK-PWR8-NEXT: [[MUL4_PROL:%.*]] = fmul half [[TMP3]], [[TMP0]]
-; CHECK-PWR8-NEXT: store half [[MUL3_PROL]], half* [[POUT_ADDR_028_PROL]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX6_PROL:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028_PROL]], i64 1
-; CHECK-PWR8-NEXT: store half [[MUL4_PROL]], half* [[ARRAYIDX6_PROL]], align 2
-; CHECK-PWR8-NEXT: [[ADD_PTR_PROL]] = getelementptr inbounds half, half* [[PIN_ADDR_029_PROL]], i64 2
-; CHECK-PWR8-NEXT: [[ADD_PTR7_PROL]] = getelementptr inbounds half, half* [[POUT_ADDR_028_PROL]], i64 2
-; CHECK-PWR8-NEXT: [[DEC_PROL]] = add nsw i32 [[BLKCNT_027_PROL]], -1
-; CHECK-PWR8-NEXT: [[PROL_ITER_NEXT]] = add i32 [[PROL_ITER]], 1
-; CHECK-PWR8-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i32 [[PROL_ITER_NEXT]], [[XTRAITER]]
-; CHECK-PWR8-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label [[WHILE_BODY_PROL_LOOPEXIT]], label [[WHILE_BODY_PROL]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-PWR8: while.body.prol.loopexit:
-; CHECK-PWR8-NEXT: [[PIN_ADDR_029_UNR:%.*]] = phi half* [ [[PIN]], [[WHILE_BODY_PREHEADER]] ], [ [[ADD_PTR_PROL]], [[WHILE_BODY_PROL]] ]
-; CHECK-PWR8-NEXT: [[POUT_ADDR_028_UNR:%.*]] = phi half* [ [[POUT]], [[WHILE_BODY_PREHEADER]] ], [ [[ADD_PTR7_PROL]], [[WHILE_BODY_PROL]] ]
-; CHECK-PWR8-NEXT: [[BLKCNT_027_UNR:%.*]] = phi i32 [ [[SHR]], [[WHILE_BODY_PREHEADER]] ], [ [[DEC_PROL]], [[WHILE_BODY_PROL]] ]
-; CHECK-PWR8-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP1]], 7
-; CHECK-PWR8-NEXT: br i1 [[TMP4]], label [[WHILE_END]], label [[WHILE_BODY:%.*]]
-; CHECK-PWR8: while.body:
-; CHECK-PWR8-NEXT: [[PIN_ADDR_029:%.*]] = phi half* [ [[ADD_PTR_7:%.*]], [[WHILE_BODY]] ], [ [[PIN_ADDR_029_UNR]], [[WHILE_BODY_PROL_LOOPEXIT]] ]
-; CHECK-PWR8-NEXT: [[POUT_ADDR_028:%.*]] = phi half* [ [[ADD_PTR7_7:%.*]], [[WHILE_BODY]] ], [ [[POUT_ADDR_028_UNR]], [[WHILE_BODY_PROL_LOOPEXIT]] ]
-; CHECK-PWR8-NEXT: [[BLKCNT_027:%.*]] = phi i32 [ [[DEC_7:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_027_UNR]], [[WHILE_BODY_PROL_LOOPEXIT]] ]
-; CHECK-PWR8-NEXT: [[TMP5:%.*]] = load half, half* [[PIN_ADDR_029]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 1
-; CHECK-PWR8-NEXT: [[TMP6:%.*]] = load half, half* [[ARRAYIDX2]], align 2
-; CHECK-PWR8-NEXT: [[MUL3:%.*]] = fmul half [[TMP5]], [[TMP0]]
-; CHECK-PWR8-NEXT: [[MUL4:%.*]] = fmul half [[TMP6]], [[TMP0]]
-; CHECK-PWR8-NEXT: store half [[MUL3]], half* [[POUT_ADDR_028]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 1
-; CHECK-PWR8-NEXT: store half [[MUL4]], half* [[ARRAYIDX6]], align 2
-; CHECK-PWR8-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 2
-; CHECK-PWR8-NEXT: [[ADD_PTR7:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 2
-; CHECK-PWR8-NEXT: [[TMP7:%.*]] = load half, half* [[ADD_PTR]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 3
-; CHECK-PWR8-NEXT: [[TMP8:%.*]] = load half, half* [[ARRAYIDX2_1]], align 2
-; CHECK-PWR8-NEXT: [[MUL3_1:%.*]] = fmul half [[TMP7]], [[TMP0]]
-; CHECK-PWR8-NEXT: [[MUL4_1:%.*]] = fmul half [[TMP8]], [[TMP0]]
-; CHECK-PWR8-NEXT: store half [[MUL3_1]], half* [[ADD_PTR7]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 3
-; CHECK-PWR8-NEXT: store half [[MUL4_1]], half* [[ARRAYIDX6_1]], align 2
-; CHECK-PWR8-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 4
-; CHECK-PWR8-NEXT: [[ADD_PTR7_1:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 4
-; CHECK-PWR8-NEXT: [[TMP9:%.*]] = load half, half* [[ADD_PTR_1]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 5
-; CHECK-PWR8-NEXT: [[TMP10:%.*]] = load half, half* [[ARRAYIDX2_2]], align 2
-; CHECK-PWR8-NEXT: [[MUL3_2:%.*]] = fmul half [[TMP9]], [[TMP0]]
-; CHECK-PWR8-NEXT: [[MUL4_2:%.*]] = fmul half [[TMP10]], [[TMP0]]
-; CHECK-PWR8-NEXT: store half [[MUL3_2]], half* [[ADD_PTR7_1]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 5
-; CHECK-PWR8-NEXT: store half [[MUL4_2]], half* [[ARRAYIDX6_2]], align 2
-; CHECK-PWR8-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 6
-; CHECK-PWR8-NEXT: [[ADD_PTR7_2:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 6
-; CHECK-PWR8-NEXT: [[TMP11:%.*]] = load half, half* [[ADD_PTR_2]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 7
-; CHECK-PWR8-NEXT: [[TMP12:%.*]] = load half, half* [[ARRAYIDX2_3]], align 2
-; CHECK-PWR8-NEXT: [[MUL3_3:%.*]] = fmul half [[TMP11]], [[TMP0]]
-; CHECK-PWR8-NEXT: [[MUL4_3:%.*]] = fmul half [[TMP12]], [[TMP0]]
-; CHECK-PWR8-NEXT: store half [[MUL3_3]], half* [[ADD_PTR7_2]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 7
-; CHECK-PWR8-NEXT: store half [[MUL4_3]], half* [[ARRAYIDX6_3]], align 2
-; CHECK-PWR8-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 8
-; CHECK-PWR8-NEXT: [[ADD_PTR7_3:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 8
-; CHECK-PWR8-NEXT: [[TMP13:%.*]] = load half, half* [[ADD_PTR_3]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 9
-; CHECK-PWR8-NEXT: [[TMP14:%.*]] = load half, half* [[ARRAYIDX2_4]], align 2
-; CHECK-PWR8-NEXT: [[MUL3_4:%.*]] = fmul half [[TMP13]], [[TMP0]]
-; CHECK-PWR8-NEXT: [[MUL4_4:%.*]] = fmul half [[TMP14]], [[TMP0]]
-; CHECK-PWR8-NEXT: store half [[MUL3_4]], half* [[ADD_PTR7_3]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX6_4:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 9
-; CHECK-PWR8-NEXT: store half [[MUL4_4]], half* [[ARRAYIDX6_4]], align 2
-; CHECK-PWR8-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 10
-; CHECK-PWR8-NEXT: [[ADD_PTR7_4:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 10
-; CHECK-PWR8-NEXT: [[TMP15:%.*]] = load half, half* [[ADD_PTR_4]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 11
-; CHECK-PWR8-NEXT: [[TMP16:%.*]] = load half, half* [[ARRAYIDX2_5]], align 2
-; CHECK-PWR8-NEXT: [[MUL3_5:%.*]] = fmul half [[TMP15]], [[TMP0]]
-; CHECK-PWR8-NEXT: [[MUL4_5:%.*]] = fmul half [[TMP16]], [[TMP0]]
-; CHECK-PWR8-NEXT: store half [[MUL3_5]], half* [[ADD_PTR7_4]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX6_5:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 11
-; CHECK-PWR8-NEXT: store half [[MUL4_5]], half* [[ARRAYIDX6_5]], align 2
-; CHECK-PWR8-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 12
-; CHECK-PWR8-NEXT: [[ADD_PTR7_5:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 12
-; CHECK-PWR8-NEXT: [[TMP17:%.*]] = load half, half* [[ADD_PTR_5]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 13
-; CHECK-PWR8-NEXT: [[TMP18:%.*]] = load half, half* [[ARRAYIDX2_6]], align 2
-; CHECK-PWR8-NEXT: [[MUL3_6:%.*]] = fmul half [[TMP17]], [[TMP0]]
-; CHECK-PWR8-NEXT: [[MUL4_6:%.*]] = fmul half [[TMP18]], [[TMP0]]
-; CHECK-PWR8-NEXT: store half [[MUL3_6]], half* [[ADD_PTR7_5]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX6_6:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 13
-; CHECK-PWR8-NEXT: store half [[MUL4_6]], half* [[ARRAYIDX6_6]], align 2
-; CHECK-PWR8-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 14
-; CHECK-PWR8-NEXT: [[ADD_PTR7_6:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 14
-; CHECK-PWR8-NEXT: [[TMP19:%.*]] = load half, half* [[ADD_PTR_6]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 15
-; CHECK-PWR8-NEXT: [[TMP20:%.*]] = load half, half* [[ARRAYIDX2_7]], align 2
-; CHECK-PWR8-NEXT: [[MUL3_7:%.*]] = fmul half [[TMP19]], [[TMP0]]
-; CHECK-PWR8-NEXT: [[MUL4_7:%.*]] = fmul half [[TMP20]], [[TMP0]]
-; CHECK-PWR8-NEXT: store half [[MUL3_7]], half* [[ADD_PTR7_6]], align 2
-; CHECK-PWR8-NEXT: [[ARRAYIDX6_7:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 15
-; CHECK-PWR8-NEXT: store half [[MUL4_7]], half* [[ARRAYIDX6_7]], align 2
-; CHECK-PWR8-NEXT: [[ADD_PTR_7]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 16
-; CHECK-PWR8-NEXT: [[ADD_PTR7_7]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 16
-; CHECK-PWR8-NEXT: [[DEC_7]] = add nsw i32 [[BLKCNT_027]], -8
-; CHECK-PWR8-NEXT: [[CMP_7:%.*]] = icmp eq i32 [[DEC_7]], 0
-; CHECK-PWR8-NEXT: br i1 [[CMP_7]], label [[WHILE_END]], label [[WHILE_BODY]]
-; CHECK-PWR8: while.end:
-; CHECK-PWR8-NEXT: ret void
-;
-; CHECK-PWR9-LABEL: @fp16_(
-; CHECK-PWR9-NEXT: entry:
-; CHECK-PWR9-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[SCALE_COERCE:%.*]] to i16
-; CHECK-PWR9-NEXT: [[TMP0:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half
-; CHECK-PWR9-NEXT: [[MUL:%.*]] = mul i32 [[NUMCOLS:%.*]], [[NUMROWS:%.*]]
-; CHECK-PWR9-NEXT: [[CMP26:%.*]] = icmp ult i32 [[MUL]], 4
-; CHECK-PWR9-NEXT: br i1 [[CMP26]], label [[WHILE_END:%.*]], label [[WHILE_BODY_PREHEADER:%.*]]
-; CHECK-PWR9: while.body.preheader:
-; CHECK-PWR9-NEXT: [[SHR:%.*]] = lshr i32 [[MUL]], 2
-; CHECK-PWR9-NEXT: [[TMP1:%.*]] = add nsw i32 [[SHR]], -1
-; CHECK-PWR9-NEXT: [[XTRAITER:%.*]] = and i32 [[SHR]], 7
-; CHECK-PWR9-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 0
-; CHECK-PWR9-NEXT: br i1 [[LCMP_MOD_NOT]], label [[WHILE_BODY_PROL_LOOPEXIT:%.*]], label [[WHILE_BODY_PROL:%.*]]
-; CHECK-PWR9: while.body.prol:
-; CHECK-PWR9-NEXT: [[PIN_ADDR_029_PROL:%.*]] = phi half* [ [[ADD_PTR_PROL:%.*]], [[WHILE_BODY_PROL]] ], [ [[PIN:%.*]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR9-NEXT: [[POUT_ADDR_028_PROL:%.*]] = phi half* [ [[ADD_PTR7_PROL:%.*]], [[WHILE_BODY_PROL]] ], [ [[POUT:%.*]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR9-NEXT: [[BLKCNT_027_PROL:%.*]] = phi i32 [ [[DEC_PROL:%.*]], [[WHILE_BODY_PROL]] ], [ [[SHR]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR9-NEXT: [[PROL_ITER:%.*]] = phi i32 [ [[PROL_ITER_NEXT:%.*]], [[WHILE_BODY_PROL]] ], [ 0, [[WHILE_BODY_PREHEADER]] ]
-; CHECK-PWR9-NEXT: [[TMP2:%.*]] = load half, half* [[PIN_ADDR_029_PROL]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX2_PROL:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029_PROL]], i64 1
-; CHECK-PWR9-NEXT: [[TMP3:%.*]] = load half, half* [[ARRAYIDX2_PROL]], align 2
-; CHECK-PWR9-NEXT: [[MUL3_PROL:%.*]] = fmul half [[TMP2]], [[TMP0]]
-; CHECK-PWR9-NEXT: [[MUL4_PROL:%.*]] = fmul half [[TMP3]], [[TMP0]]
-; CHECK-PWR9-NEXT: store half [[MUL3_PROL]], half* [[POUT_ADDR_028_PROL]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX6_PROL:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028_PROL]], i64 1
-; CHECK-PWR9-NEXT: store half [[MUL4_PROL]], half* [[ARRAYIDX6_PROL]], align 2
-; CHECK-PWR9-NEXT: [[ADD_PTR_PROL]] = getelementptr inbounds half, half* [[PIN_ADDR_029_PROL]], i64 2
-; CHECK-PWR9-NEXT: [[ADD_PTR7_PROL]] = getelementptr inbounds half, half* [[POUT_ADDR_028_PROL]], i64 2
-; CHECK-PWR9-NEXT: [[DEC_PROL]] = add nsw i32 [[BLKCNT_027_PROL]], -1
-; CHECK-PWR9-NEXT: [[PROL_ITER_NEXT]] = add i32 [[PROL_ITER]], 1
-; CHECK-PWR9-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i32 [[PROL_ITER_NEXT]], [[XTRAITER]]
-; CHECK-PWR9-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label [[WHILE_BODY_PROL_LOOPEXIT]], label [[WHILE_BODY_PROL]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-PWR9: while.body.prol.loopexit:
-; CHECK-PWR9-NEXT: [[PIN_ADDR_029_UNR:%.*]] = phi half* [ [[PIN]], [[WHILE_BODY_PREHEADER]] ], [ [[ADD_PTR_PROL]], [[WHILE_BODY_PROL]] ]
-; CHECK-PWR9-NEXT: [[POUT_ADDR_028_UNR:%.*]] = phi half* [ [[POUT]], [[WHILE_BODY_PREHEADER]] ], [ [[ADD_PTR7_PROL]], [[WHILE_BODY_PROL]] ]
-; CHECK-PWR9-NEXT: [[BLKCNT_027_UNR:%.*]] = phi i32 [ [[SHR]], [[WHILE_BODY_PREHEADER]] ], [ [[DEC_PROL]], [[WHILE_BODY_PROL]] ]
-; CHECK-PWR9-NEXT: [[TMP4:%.*]] = icmp ult i32 [[TMP1]], 7
-; CHECK-PWR9-NEXT: br i1 [[TMP4]], label [[WHILE_END]], label [[WHILE_BODY:%.*]]
-; CHECK-PWR9: while.body:
-; CHECK-PWR9-NEXT: [[PIN_ADDR_029:%.*]] = phi half* [ [[ADD_PTR_7:%.*]], [[WHILE_BODY]] ], [ [[PIN_ADDR_029_UNR]], [[WHILE_BODY_PROL_LOOPEXIT]] ]
-; CHECK-PWR9-NEXT: [[POUT_ADDR_028:%.*]] = phi half* [ [[ADD_PTR7_7:%.*]], [[WHILE_BODY]] ], [ [[POUT_ADDR_028_UNR]], [[WHILE_BODY_PROL_LOOPEXIT]] ]
-; CHECK-PWR9-NEXT: [[BLKCNT_027:%.*]] = phi i32 [ [[DEC_7:%.*]], [[WHILE_BODY]] ], [ [[BLKCNT_027_UNR]], [[WHILE_BODY_PROL_LOOPEXIT]] ]
-; CHECK-PWR9-NEXT: [[TMP5:%.*]] = load half, half* [[PIN_ADDR_029]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 1
-; CHECK-PWR9-NEXT: [[TMP6:%.*]] = load half, half* [[ARRAYIDX2]], align 2
-; CHECK-PWR9-NEXT: [[MUL3:%.*]] = fmul half [[TMP5]], [[TMP0]]
-; CHECK-PWR9-NEXT: [[MUL4:%.*]] = fmul half [[TMP6]], [[TMP0]]
-; CHECK-PWR9-NEXT: store half [[MUL3]], half* [[POUT_ADDR_028]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 1
-; CHECK-PWR9-NEXT: store half [[MUL4]], half* [[ARRAYIDX6]], align 2
-; CHECK-PWR9-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 2
-; CHECK-PWR9-NEXT: [[ADD_PTR7:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 2
-; CHECK-PWR9-NEXT: [[TMP7:%.*]] = load half, half* [[ADD_PTR]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX2_1:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 3
-; CHECK-PWR9-NEXT: [[TMP8:%.*]] = load half, half* [[ARRAYIDX2_1]], align 2
-; CHECK-PWR9-NEXT: [[MUL3_1:%.*]] = fmul half [[TMP7]], [[TMP0]]
-; CHECK-PWR9-NEXT: [[MUL4_1:%.*]] = fmul half [[TMP8]], [[TMP0]]
-; CHECK-PWR9-NEXT: store half [[MUL3_1]], half* [[ADD_PTR7]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 3
-; CHECK-PWR9-NEXT: store half [[MUL4_1]], half* [[ARRAYIDX6_1]], align 2
-; CHECK-PWR9-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 4
-; CHECK-PWR9-NEXT: [[ADD_PTR7_1:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 4
-; CHECK-PWR9-NEXT: [[TMP9:%.*]] = load half, half* [[ADD_PTR_1]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 5
-; CHECK-PWR9-NEXT: [[TMP10:%.*]] = load half, half* [[ARRAYIDX2_2]], align 2
-; CHECK-PWR9-NEXT: [[MUL3_2:%.*]] = fmul half [[TMP9]], [[TMP0]]
-; CHECK-PWR9-NEXT: [[MUL4_2:%.*]] = fmul half [[TMP10]], [[TMP0]]
-; CHECK-PWR9-NEXT: store half [[MUL3_2]], half* [[ADD_PTR7_1]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 5
-; CHECK-PWR9-NEXT: store half [[MUL4_2]], half* [[ARRAYIDX6_2]], align 2
-; CHECK-PWR9-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 6
-; CHECK-PWR9-NEXT: [[ADD_PTR7_2:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 6
-; CHECK-PWR9-NEXT: [[TMP11:%.*]] = load half, half* [[ADD_PTR_2]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 7
-; CHECK-PWR9-NEXT: [[TMP12:%.*]] = load half, half* [[ARRAYIDX2_3]], align 2
-; CHECK-PWR9-NEXT: [[MUL3_3:%.*]] = fmul half [[TMP11]], [[TMP0]]
-; CHECK-PWR9-NEXT: [[MUL4_3:%.*]] = fmul half [[TMP12]], [[TMP0]]
-; CHECK-PWR9-NEXT: store half [[MUL3_3]], half* [[ADD_PTR7_2]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 7
-; CHECK-PWR9-NEXT: store half [[MUL4_3]], half* [[ARRAYIDX6_3]], align 2
-; CHECK-PWR9-NEXT: [[ADD_PTR_3:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 8
-; CHECK-PWR9-NEXT: [[ADD_PTR7_3:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 8
-; CHECK-PWR9-NEXT: [[TMP13:%.*]] = load half, half* [[ADD_PTR_3]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 9
-; CHECK-PWR9-NEXT: [[TMP14:%.*]] = load half, half* [[ARRAYIDX2_4]], align 2
-; CHECK-PWR9-NEXT: [[MUL3_4:%.*]] = fmul half [[TMP13]], [[TMP0]]
-; CHECK-PWR9-NEXT: [[MUL4_4:%.*]] = fmul half [[TMP14]], [[TMP0]]
-; CHECK-PWR9-NEXT: store half [[MUL3_4]], half* [[ADD_PTR7_3]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX6_4:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 9
-; CHECK-PWR9-NEXT: store half [[MUL4_4]], half* [[ARRAYIDX6_4]], align 2
-; CHECK-PWR9-NEXT: [[ADD_PTR_4:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 10
-; CHECK-PWR9-NEXT: [[ADD_PTR7_4:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 10
-; CHECK-PWR9-NEXT: [[TMP15:%.*]] = load half, half* [[ADD_PTR_4]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 11
-; CHECK-PWR9-NEXT: [[TMP16:%.*]] = load half, half* [[ARRAYIDX2_5]], align 2
-; CHECK-PWR9-NEXT: [[MUL3_5:%.*]] = fmul half [[TMP15]], [[TMP0]]
-; CHECK-PWR9-NEXT: [[MUL4_5:%.*]] = fmul half [[TMP16]], [[TMP0]]
-; CHECK-PWR9-NEXT: store half [[MUL3_5]], half* [[ADD_PTR7_4]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX6_5:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 11
-; CHECK-PWR9-NEXT: store half [[MUL4_5]], half* [[ARRAYIDX6_5]], align 2
-; CHECK-PWR9-NEXT: [[ADD_PTR_5:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 12
-; CHECK-PWR9-NEXT: [[ADD_PTR7_5:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 12
-; CHECK-PWR9-NEXT: [[TMP17:%.*]] = load half, half* [[ADD_PTR_5]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX2_6:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 13
-; CHECK-PWR9-NEXT: [[TMP18:%.*]] = load half, half* [[ARRAYIDX2_6]], align 2
-; CHECK-PWR9-NEXT: [[MUL3_6:%.*]] = fmul half [[TMP17]], [[TMP0]]
-; CHECK-PWR9-NEXT: [[MUL4_6:%.*]] = fmul half [[TMP18]], [[TMP0]]
-; CHECK-PWR9-NEXT: store half [[MUL3_6]], half* [[ADD_PTR7_5]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX6_6:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 13
-; CHECK-PWR9-NEXT: store half [[MUL4_6]], half* [[ARRAYIDX6_6]], align 2
-; CHECK-PWR9-NEXT: [[ADD_PTR_6:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 14
-; CHECK-PWR9-NEXT: [[ADD_PTR7_6:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 14
-; CHECK-PWR9-NEXT: [[TMP19:%.*]] = load half, half* [[ADD_PTR_6]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX2_7:%.*]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 15
-; CHECK-PWR9-NEXT: [[TMP20:%.*]] = load half, half* [[ARRAYIDX2_7]], align 2
-; CHECK-PWR9-NEXT: [[MUL3_7:%.*]] = fmul half [[TMP19]], [[TMP0]]
-; CHECK-PWR9-NEXT: [[MUL4_7:%.*]] = fmul half [[TMP20]], [[TMP0]]
-; CHECK-PWR9-NEXT: store half [[MUL3_7]], half* [[ADD_PTR7_6]], align 2
-; CHECK-PWR9-NEXT: [[ARRAYIDX6_7:%.*]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 15
-; CHECK-PWR9-NEXT: store half [[MUL4_7]], half* [[ARRAYIDX6_7]], align 2
-; CHECK-PWR9-NEXT: [[ADD_PTR_7]] = getelementptr inbounds half, half* [[PIN_ADDR_029]], i64 16
-; CHECK-PWR9-NEXT: [[ADD_PTR7_7]] = getelementptr inbounds half, half* [[POUT_ADDR_028]], i64 16
-; CHECK-PWR9-NEXT: [[DEC_7]] = add nsw i32 [[BLKCNT_027]], -8
-; CHECK-PWR9-NEXT: [[CMP_7:%.*]] = icmp eq i32 [[DEC_7]], 0
-; CHECK-PWR9-NEXT: br i1 [[CMP_7]], label [[WHILE_END]], label [[WHILE_BODY]]
-; CHECK-PWR9: while.end:
-; CHECK-PWR9-NEXT: ret void
-;
+; CHECK-LABEL: fp16_
+; CHECK: LV(REG): VF = 1
+; CHECK: LV(REG): Found max usage: 2 item
+; CHECK: LV(REG): RegisterClass: PPC::GPRRC, 4 registers
+; CHECK: LV(REG): RegisterClass: PPC::VSXRC, 2 registers
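+; The LV(REG) lines are the loop vectorizer's register-usage debug output; only
+; the VF = 1 estimate for the PPC GPR and VSX register classes is pinned here.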
entry:
%tmp.0.extract.trunc = trunc i32 %scale.coerce to i16
%0 = bitcast i16 %tmp.0.extract.trunc to half
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -basic-aa -loop-vectorize < %s | FileCheck %s
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
; Function Attrs: nounwind
define void @foo(double* noalias nocapture %a, double* noalias nocapture readonly %b) #0 {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 6
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 10
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 12
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 14
-; CHECK-NEXT: [[TMP8:%.*]] = shl nsw i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = shl nsw i64 [[TMP2]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = shl nsw i64 [[TMP3]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = shl nsw i64 [[TMP4]], 1
-; CHECK-NEXT: [[TMP13:%.*]] = shl nsw i64 [[TMP5]], 1
-; CHECK-NEXT: [[TMP14:%.*]] = shl nsw i64 [[TMP6]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = shl nsw i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP14]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP16]], i32 0
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>*
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP18]], i32 0
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP19]], i32 0
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[TMP21]], i32 0
-; CHECK-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
-; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds double, double* [[TMP22]], i32 0
-; CHECK-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds double, double* [[TMP23]], i32 0
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <4 x double>, <4 x double>* [[TMP25]], align 8
-; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load <4 x double>, <4 x double>* [[TMP27]], align 8
-; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <4 x double>, <4 x double>* [[TMP29]], align 8
-; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <4 x double>, <4 x double>* [[TMP31]], align 8
-; CHECK-NEXT: [[WIDE_VEC4:%.*]] = load <4 x double>, <4 x double>* [[TMP33]], align 8
-; CHECK-NEXT: [[WIDE_VEC5:%.*]] = load <4 x double>, <4 x double>* [[TMP35]], align 8
-; CHECK-NEXT: [[WIDE_VEC6:%.*]] = load <4 x double>, <4 x double>* [[TMP37]], align 8
-; CHECK-NEXT: [[WIDE_VEC7:%.*]] = load <4 x double>, <4 x double>* [[TMP39]], align 8
-; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <4 x double> [[WIDE_VEC1]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <4 x double> [[WIDE_VEC3]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = shufflevector <4 x double> [[WIDE_VEC4]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC12:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC13:%.*]] = shufflevector <4 x double> [[WIDE_VEC6]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC14:%.*]] = shufflevector <4 x double> [[WIDE_VEC7]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
-; CHECK-NEXT: [[STRIDED_VEC15:%.*]] = shufflevector <4 x double> [[WIDE_VEC]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[STRIDED_VEC16:%.*]] = shufflevector <4 x double> [[WIDE_VEC1]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[STRIDED_VEC17:%.*]] = shufflevector <4 x double> [[WIDE_VEC2]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[STRIDED_VEC18:%.*]] = shufflevector <4 x double> [[WIDE_VEC3]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[STRIDED_VEC19:%.*]] = shufflevector <4 x double> [[WIDE_VEC4]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[STRIDED_VEC20:%.*]] = shufflevector <4 x double> [[WIDE_VEC5]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[STRIDED_VEC21:%.*]] = shufflevector <4 x double> [[WIDE_VEC6]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[STRIDED_VEC22:%.*]] = shufflevector <4 x double> [[WIDE_VEC7]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
-; CHECK-NEXT: [[TMP40:%.*]] = fadd <2 x double> [[STRIDED_VEC]], [[STRIDED_VEC15]]
-; CHECK-NEXT: [[TMP41:%.*]] = fadd <2 x double> [[STRIDED_VEC8]], [[STRIDED_VEC16]]
-; CHECK-NEXT: [[TMP42:%.*]] = fadd <2 x double> [[STRIDED_VEC9]], [[STRIDED_VEC17]]
-; CHECK-NEXT: [[TMP43:%.*]] = fadd <2 x double> [[STRIDED_VEC10]], [[STRIDED_VEC18]]
-; CHECK-NEXT: [[TMP44:%.*]] = fadd <2 x double> [[STRIDED_VEC11]], [[STRIDED_VEC19]]
-; CHECK-NEXT: [[TMP45:%.*]] = fadd <2 x double> [[STRIDED_VEC12]], [[STRIDED_VEC20]]
-; CHECK-NEXT: [[TMP46:%.*]] = fadd <2 x double> [[STRIDED_VEC13]], [[STRIDED_VEC21]]
-; CHECK-NEXT: [[TMP47:%.*]] = fadd <2 x double> [[STRIDED_VEC14]], [[STRIDED_VEC22]]
-; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP50:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP52:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP53:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP54:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 0
-; CHECK-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP40]], <2 x double>* [[TMP57]], align 8
-; CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 2
-; CHECK-NEXT: [[TMP59:%.*]] = bitcast double* [[TMP58]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP41]], <2 x double>* [[TMP59]], align 8
-; CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 4
-; CHECK-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP42]], <2 x double>* [[TMP61]], align 8
-; CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 6
-; CHECK-NEXT: [[TMP63:%.*]] = bitcast double* [[TMP62]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP43]], <2 x double>* [[TMP63]], align 8
-; CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 8
-; CHECK-NEXT: [[TMP65:%.*]] = bitcast double* [[TMP64]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP44]], <2 x double>* [[TMP65]], align 8
-; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 10
-; CHECK-NEXT: [[TMP67:%.*]] = bitcast double* [[TMP66]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP45]], <2 x double>* [[TMP67]], align 8
-; CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 12
-; CHECK-NEXT: [[TMP69:%.*]] = bitcast double* [[TMP68]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP46]], <2 x double>* [[TMP69]], align 8
-; CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds double, double* [[TMP48]], i32 14
-; CHECK-NEXT: [[TMP71:%.*]] = bitcast double* [[TMP70]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP47]], <2 x double>* [[TMP71]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT: br i1 [[TMP72]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1600, 1600
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1600, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP73:%.*]] = shl nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[ODD_IDX:%.*]] = add nsw i64 [[TMP73]], 1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP73]]
-; CHECK-NEXT: [[ARRAYIDX_ODD:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[ODD_IDX]]
-; CHECK-NEXT: [[TMP74:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[TMP75:%.*]] = load double, double* [[ARRAYIDX_ODD]], align 8
-; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP74]], [[TMP75]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store double [[ADD]], double* [[ARRAYIDX2]], align 8
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1600
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
entry:
br label %for.body
+; CHECK-LABEL: @foo
+; CHECK: <2 x double>
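+; The even/odd interleaved loads from %b should be vectorized, so <2 x double>
+; operations are expected in the output.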
for.cond.cleanup: ; preds = %for.body
ret void
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize < %s | FileCheck %s
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64le-unknown-linux"
; Function Attrs: nounwind
define zeroext i32 @test() #0 {
-; CHECK-LABEL: @test(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A:%.*]] = alloca [1600 x i32], align 4
-; CHECK-NEXT: [[C:%.*]] = alloca [1600 x i32], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast [1600 x i32]* [[A]] to i8*
-; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 6400, i8* [[TMP0]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT: [[INDUCTION:%.*]] = add i32 [[TMP1]], 0
-; CHECK-NEXT: [[INDUCTION1:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT: [[INDUCTION2:%.*]] = add i32 [[TMP1]], 2
-; CHECK-NEXT: [[INDUCTION3:%.*]] = add i32 [[TMP1]], 3
-; CHECK-NEXT: [[INDUCTION4:%.*]] = add i32 [[TMP1]], 4
-; CHECK-NEXT: [[INDUCTION5:%.*]] = add i32 [[TMP1]], 5
-; CHECK-NEXT: [[INDUCTION6:%.*]] = add i32 [[TMP1]], 6
-; CHECK-NEXT: [[INDUCTION7:%.*]] = add i32 [[TMP1]], 7
-; CHECK-NEXT: [[INDUCTION8:%.*]] = add i32 [[TMP1]], 8
-; CHECK-NEXT: [[INDUCTION9:%.*]] = add i32 [[TMP1]], 9
-; CHECK-NEXT: [[INDUCTION10:%.*]] = add i32 [[TMP1]], 10
-; CHECK-NEXT: [[INDUCTION11:%.*]] = add i32 [[TMP1]], 11
-; CHECK-NEXT: [[INDUCTION12:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[INDUCTION13:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[INDUCTION14:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[INDUCTION15:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[INDUCTION16:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[INDUCTION17:%.*]] = add i64 [[INDEX]], 5
-; CHECK-NEXT: [[INDUCTION18:%.*]] = add i64 [[INDEX]], 6
-; CHECK-NEXT: [[INDUCTION19:%.*]] = add i64 [[INDEX]], 7
-; CHECK-NEXT: [[INDUCTION20:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT: [[INDUCTION21:%.*]] = add i64 [[INDEX]], 9
-; CHECK-NEXT: [[INDUCTION22:%.*]] = add i64 [[INDEX]], 10
-; CHECK-NEXT: [[INDUCTION23:%.*]] = add i64 [[INDEX]], 11
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION12]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION13]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION14]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION15]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION16]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION17]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION18]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION19]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION20]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION21]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION22]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDUCTION23]]
-; CHECK-NEXT: store i32 [[INDUCTION]], i32* [[TMP2]], align 4
-; CHECK-NEXT: store i32 [[INDUCTION1]], i32* [[TMP3]], align 4
-; CHECK-NEXT: store i32 [[INDUCTION2]], i32* [[TMP4]], align 4
-; CHECK-NEXT: store i32 [[INDUCTION3]], i32* [[TMP5]], align 4
-; CHECK-NEXT: store i32 [[INDUCTION4]], i32* [[TMP6]], align 4
-; CHECK-NEXT: store i32 [[INDUCTION5]], i32* [[TMP7]], align 4
-; CHECK-NEXT: store i32 [[INDUCTION6]], i32* [[TMP8]], align 4
-; CHECK-NEXT: store i32 [[INDUCTION7]], i32* [[TMP9]], align 4
-; CHECK-NEXT: store i32 [[INDUCTION8]], i32* [[TMP10]], align 4
-; CHECK-NEXT: store i32 [[INDUCTION9]], i32* [[TMP11]], align 4
-; CHECK-NEXT: store i32 [[INDUCTION10]], i32* [[TMP12]], align 4
-; CHECK-NEXT: store i32 [[INDUCTION11]], i32* [[TMP13]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 12
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1596
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1600, 1596
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1596, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast [1600 x i32]* [[C]] to i8*
-; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 6400, i8* [[TMP15]]) #[[ATTR3]]
-; CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 0
-; CHECK-NEXT: [[ARRAYDECAY1:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 0
-; CHECK-NEXT: [[CALL:%.*]] = call signext i32 @bar(i32* [[ARRAYDECAY]], i32* [[ARRAYDECAY1]]) #[[ATTR3]]
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH25:%.*]], label [[VECTOR_PH27:%.*]]
-; CHECK: vector.ph27:
-; CHECK-NEXT: br label [[VECTOR_BODY26:%.*]]
-; CHECK: vector.body26:
-; CHECK-NEXT: [[INDEX30:%.*]] = phi i64 [ 0, [[VECTOR_PH27]] ], [ [[INDEX_NEXT46:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP32:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT: [[VEC_PHI31:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP33:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT: [[VEC_PHI32:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP34:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT: [[VEC_PHI33:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP35:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT: [[VEC_PHI34:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP36:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT: [[VEC_PHI35:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP37:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT: [[VEC_PHI36:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP38:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT: [[VEC_PHI37:%.*]] = phi i32 [ 0, [[VECTOR_PH27]] ], [ [[TMP39:%.*]], [[VECTOR_BODY26]] ]
-; CHECK-NEXT: [[INDUCTION38:%.*]] = add i64 [[INDEX30]], 0
-; CHECK-NEXT: [[INDUCTION39:%.*]] = add i64 [[INDEX30]], 1
-; CHECK-NEXT: [[INDUCTION40:%.*]] = add i64 [[INDEX30]], 2
-; CHECK-NEXT: [[INDUCTION41:%.*]] = add i64 [[INDEX30]], 3
-; CHECK-NEXT: [[INDUCTION42:%.*]] = add i64 [[INDEX30]], 4
-; CHECK-NEXT: [[INDUCTION43:%.*]] = add i64 [[INDEX30]], 5
-; CHECK-NEXT: [[INDUCTION44:%.*]] = add i64 [[INDEX30]], 6
-; CHECK-NEXT: [[INDUCTION45:%.*]] = add i64 [[INDEX30]], 7
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION38]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION39]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION40]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION41]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION42]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION43]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION44]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDUCTION45]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP16]], align 4
-; CHECK-NEXT: [[TMP25:%.*]] = load i32, i32* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[TMP18]], align 4
-; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP19]], align 4
-; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[TMP20]], align 4
-; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[TMP21]], align 4
-; CHECK-NEXT: [[TMP30:%.*]] = load i32, i32* [[TMP22]], align 4
-; CHECK-NEXT: [[TMP31:%.*]] = load i32, i32* [[TMP23]], align 4
-; CHECK-NEXT: [[TMP32]] = add i32 [[TMP24]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP33]] = add i32 [[TMP25]], [[VEC_PHI31]]
-; CHECK-NEXT: [[TMP34]] = add i32 [[TMP26]], [[VEC_PHI32]]
-; CHECK-NEXT: [[TMP35]] = add i32 [[TMP27]], [[VEC_PHI33]]
-; CHECK-NEXT: [[TMP36]] = add i32 [[TMP28]], [[VEC_PHI34]]
-; CHECK-NEXT: [[TMP37]] = add i32 [[TMP29]], [[VEC_PHI35]]
-; CHECK-NEXT: [[TMP38]] = add i32 [[TMP30]], [[VEC_PHI36]]
-; CHECK-NEXT: [[TMP39]] = add i32 [[TMP31]], [[VEC_PHI37]]
-; CHECK-NEXT: [[INDEX_NEXT46]] = add nuw i64 [[INDEX30]], 8
-; CHECK-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT46]], 1600
-; CHECK-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK24:%.*]], label [[VECTOR_BODY26]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: middle.block24:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP33]], [[TMP32]]
-; CHECK-NEXT: [[BIN_RDX47:%.*]] = add i32 [[TMP34]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX48:%.*]] = add i32 [[TMP35]], [[BIN_RDX47]]
-; CHECK-NEXT: [[BIN_RDX49:%.*]] = add i32 [[TMP36]], [[BIN_RDX48]]
-; CHECK-NEXT: [[BIN_RDX50:%.*]] = add i32 [[TMP37]], [[BIN_RDX49]]
-; CHECK-NEXT: [[BIN_RDX51:%.*]] = add i32 [[TMP38]], [[BIN_RDX50]]
-; CHECK-NEXT: [[BIN_RDX52:%.*]] = add i32 [[TMP39]], [[BIN_RDX51]]
-; CHECK-NEXT: [[CMP_N29:%.*]] = icmp eq i64 1600, 1600
-; CHECK-NEXT: br i1 [[CMP_N29]], label [[FOR_COND_CLEANUP5:%.*]], label [[SCALAR_PH25]]
-; CHECK: scalar.ph25:
-; CHECK-NEXT: [[BC_RESUME_VAL28:%.*]] = phi i64 [ 1600, [[MIDDLE_BLOCK24]] ], [ 0, [[FOR_COND_CLEANUP]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_COND_CLEANUP]] ], [ [[BIN_RDX52]], [[MIDDLE_BLOCK24]] ]
-; CHECK-NEXT: br label [[FOR_BODY6:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV25:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT26:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[A]], i64 0, i64 [[INDVARS_IV25]]
-; CHECK-NEXT: [[TMP41:%.*]] = trunc i64 [[INDVARS_IV25]] to i32
-; CHECK-NEXT: store i32 [[TMP41]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT26]] = add nuw nsw i64 [[INDVARS_IV25]], 1
-; CHECK-NEXT: [[EXITCOND27:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT26]], 1600
-; CHECK-NEXT: br i1 [[EXITCOND27]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: for.cond.cleanup5:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY6]] ], [ [[BIN_RDX52]], [[MIDDLE_BLOCK24]] ]
-; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 6400, i8* nonnull [[TMP15]]) #[[ATTR3]]
-; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 6400, i8* [[TMP0]]) #[[ATTR3]]
-; CHECK-NEXT: ret i32 [[ADD_LCSSA]]
-; CHECK: for.body6:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL28]], [[SCALAR_PH25]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY6]] ]
-; CHECK-NEXT: [[S_022:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH25]] ], [ [[ADD]], [[FOR_BODY6]] ]
-; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [1600 x i32], [1600 x i32]* [[C]], i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP42:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4
-; CHECK-NEXT: [[ADD]] = add i32 [[TMP42]], [[S_022]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1600
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP5]], label [[FOR_BODY6]], !llvm.loop [[LOOP4:![0-9]+]]
-;
+; CHECK-LABEL: @test
+; CHECK-NOT: x i32>
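+; No vector-of-i32 types may appear in the output, i.e. the loops must not be
+; widened into vector operations (interleaving alone is still allowed).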
entry:
%a = alloca [1600 x i32], align 4
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mcpu=pwr7 -mattr=+vsx -loop-vectorize -instcombine -S | FileCheck %s
target datalayout = "E-m:e-i64:64-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@ntimes = external hidden unnamed_addr global i32, align 4
define signext i32 @s173() #0 {
-; CHECK-LABEL: @s173(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @ntimes, align 4
-; CHECK-NEXT: [[CMP21:%.*]] = icmp sgt i32 [[TMP0]], 0
-; CHECK-NEXT: br i1 [[CMP21]], label [[FOR_COND1_PREHEADER_PREHEADER:%.*]], label [[FOR_END12:%.*]]
-; CHECK: for.cond1.preheader.preheader:
-; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]]
-; CHECK: for.cond1.preheader:
-; CHECK-NEXT: [[NL_022:%.*]] = phi i32 [ [[INC11:%.*]], [[FOR_END:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_PREHEADER]] ]
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA:%.*]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[TMP1]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 16
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 3, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 16
-; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDEX]], 16000
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_GLOBALDATA]], %struct.GlobalData* @global_data, i64 0, i32 0, i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP8]], align 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16000
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: br label [[FOR_BODY3:%.*]]
-; CHECK: for.body3:
-; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY3]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[INC11]] = add nuw nsw i32 [[NL_022]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* @ntimes, align 4
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 10
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC11]], [[MUL]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND1_PREHEADER]], label [[FOR_END12_LOOPEXIT:%.*]]
-; CHECK: for.end12.loopexit:
-; CHECK-NEXT: br label [[FOR_END12]]
-; CHECK: for.end12:
-; CHECK-NEXT: ret i32 0
-;
entry:
%0 = load i32, i32* @ntimes, align 4
%cmp21 = icmp sgt i32 %0, 0
for.end12: ; preds = %for.end, %entry
ret i32 0
+; CHECK-LABEL: @s173
+; CHECK: load <4 x float>, <4 x float>*
+; CHECK: add nsw i64 %index, 16000
+; CHECK: ret i32 0
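+; The checks confirm the inner loop is vectorized with <4 x float> loads and
+; that the 16000-element offset is added to the vector induction variable for
+; the store address.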
}
attributes #0 = { nounwind }
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt -loop-vectorize -mtriple riscv64-linux-gnu \
; RUN: -mattr=+v,+d -debug-only=loop-vectorize \
define void @add(float* noalias nocapture readonly %src1, float* noalias nocapture readonly %src2, i32 signext %size, float* noalias nocapture writeonly %result) {
; CHECK-LABEL: add
-; CHECK-LMUL1-LABEL: @add(
-; CHECK-LMUL1-NEXT: entry:
-; CHECK-LMUL1-NEXT: [[CONV:%.*]] = zext i32 [[SIZE:%.*]] to i64
-; CHECK-LMUL1-NEXT: [[CMP10_NOT:%.*]] = icmp eq i32 [[SIZE]], 0
-; CHECK-LMUL1-NEXT: br i1 [[CMP10_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK-LMUL1: for.body.preheader:
-; CHECK-LMUL1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CONV]], 8
-; CHECK-LMUL1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-LMUL1: vector.ph:
-; CHECK-LMUL1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CONV]], 8
-; CHECK-LMUL1-NEXT: [[N_VEC:%.*]] = sub i64 [[CONV]], [[N_MOD_VF]]
-; CHECK-LMUL1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-LMUL1: vector.body:
-; CHECK-LMUL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-LMUL1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-LMUL1-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-LMUL1-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[SRC1:%.*]], i64 [[TMP0]]
-; CHECK-LMUL1-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP1]]
-; CHECK-LMUL1-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-LMUL1-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-LMUL1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-LMUL1-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-LMUL1-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-LMUL1-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-LMUL1-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[SRC2:%.*]], i64 [[TMP0]]
-; CHECK-LMUL1-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP1]]
-; CHECK-LMUL1-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-LMUL1-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-LMUL1-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP11]], align 4
-; CHECK-LMUL1-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 4
-; CHECK-LMUL1-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-LMUL1-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* [[TMP13]], align 4
-; CHECK-LMUL1-NEXT: [[TMP14:%.*]] = fadd <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
-; CHECK-LMUL1-NEXT: [[TMP15:%.*]] = fadd <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
-; CHECK-LMUL1-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[RESULT:%.*]], i64 [[TMP0]]
-; CHECK-LMUL1-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[TMP1]]
-; CHECK-LMUL1-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-LMUL1-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <4 x float>*
-; CHECK-LMUL1-NEXT: store <4 x float> [[TMP14]], <4 x float>* [[TMP19]], align 4
-; CHECK-LMUL1-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 4
-; CHECK-LMUL1-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <4 x float>*
-; CHECK-LMUL1-NEXT: store <4 x float> [[TMP15]], <4 x float>* [[TMP21]], align 4
-; CHECK-LMUL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-LMUL1-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-LMUL1-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-LMUL1: middle.block:
-; CHECK-LMUL1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CONV]], [[N_VEC]]
-; CHECK-LMUL1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-LMUL1: scalar.ph:
-; CHECK-LMUL1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-LMUL1-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-LMUL1: for.cond.cleanup.loopexit:
-; CHECK-LMUL1-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK-LMUL1: for.cond.cleanup:
-; CHECK-LMUL1-NEXT: ret void
-; CHECK-LMUL1: for.body:
-; CHECK-LMUL1-NEXT: [[I_011:%.*]] = phi i64 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-LMUL1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[I_011]]
-; CHECK-LMUL1-NEXT: [[TMP23:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-LMUL1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[I_011]]
-; CHECK-LMUL1-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-LMUL1-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], [[TMP24]]
-; CHECK-LMUL1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[I_011]]
-; CHECK-LMUL1-NEXT: store float [[ADD]], float* [[ARRAYIDX3]], align 4
-; CHECK-LMUL1-NEXT: [[ADD4]] = add nuw nsw i64 [[I_011]], 1
-; CHECK-LMUL1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[ADD4]], [[CONV]]
-; CHECK-LMUL1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
-; CHECK-LMUL2-LABEL: @add(
-; CHECK-LMUL2-NEXT: entry:
-; CHECK-LMUL2-NEXT: [[CONV:%.*]] = zext i32 [[SIZE:%.*]] to i64
-; CHECK-LMUL2-NEXT: [[CMP10_NOT:%.*]] = icmp eq i32 [[SIZE]], 0
-; CHECK-LMUL2-NEXT: br i1 [[CMP10_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK-LMUL2: for.body.preheader:
-; CHECK-LMUL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CONV]], 16
-; CHECK-LMUL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-LMUL2: vector.ph:
-; CHECK-LMUL2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CONV]], 16
-; CHECK-LMUL2-NEXT: [[N_VEC:%.*]] = sub i64 [[CONV]], [[N_MOD_VF]]
-; CHECK-LMUL2-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-LMUL2: vector.body:
-; CHECK-LMUL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-LMUL2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-LMUL2-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-LMUL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[SRC1:%.*]], i64 [[TMP0]]
-; CHECK-LMUL2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP1]]
-; CHECK-LMUL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-LMUL2-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP5]], align 4
-; CHECK-LMUL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 8
-; CHECK-LMUL2-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <8 x float>*
-; CHECK-LMUL2-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP7]], align 4
-; CHECK-LMUL2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[SRC2:%.*]], i64 [[TMP0]]
-; CHECK-LMUL2-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP1]]
-; CHECK-LMUL2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-LMUL2-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-LMUL2-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-LMUL2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 8
-; CHECK-LMUL2-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-LMUL2-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-LMUL2-NEXT: [[TMP14:%.*]] = fadd <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
-; CHECK-LMUL2-NEXT: [[TMP15:%.*]] = fadd <8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD3]]
-; CHECK-LMUL2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[RESULT:%.*]], i64 [[TMP0]]
-; CHECK-LMUL2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[TMP1]]
-; CHECK-LMUL2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-LMUL2-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>*
-; CHECK-LMUL2-NEXT: store <8 x float> [[TMP14]], <8 x float>* [[TMP19]], align 4
-; CHECK-LMUL2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 8
-; CHECK-LMUL2-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>*
-; CHECK-LMUL2-NEXT: store <8 x float> [[TMP15]], <8 x float>* [[TMP21]], align 4
-; CHECK-LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-LMUL2-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-LMUL2-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-LMUL2: middle.block:
-; CHECK-LMUL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CONV]], [[N_VEC]]
-; CHECK-LMUL2-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-LMUL2: scalar.ph:
-; CHECK-LMUL2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-LMUL2-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-LMUL2: for.cond.cleanup.loopexit:
-; CHECK-LMUL2-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK-LMUL2: for.cond.cleanup:
-; CHECK-LMUL2-NEXT: ret void
-; CHECK-LMUL2: for.body:
-; CHECK-LMUL2-NEXT: [[I_011:%.*]] = phi i64 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-LMUL2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[I_011]]
-; CHECK-LMUL2-NEXT: [[TMP23:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-LMUL2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[I_011]]
-; CHECK-LMUL2-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-LMUL2-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], [[TMP24]]
-; CHECK-LMUL2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[I_011]]
-; CHECK-LMUL2-NEXT: store float [[ADD]], float* [[ARRAYIDX3]], align 4
-; CHECK-LMUL2-NEXT: [[ADD4]] = add nuw nsw i64 [[I_011]], 1
-; CHECK-LMUL2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[ADD4]], [[CONV]]
-; CHECK-LMUL2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
-; CHECK-LMUL4-LABEL: @add(
-; CHECK-LMUL4-NEXT: entry:
-; CHECK-LMUL4-NEXT: [[CONV:%.*]] = zext i32 [[SIZE:%.*]] to i64
-; CHECK-LMUL4-NEXT: [[CMP10_NOT:%.*]] = icmp eq i32 [[SIZE]], 0
-; CHECK-LMUL4-NEXT: br i1 [[CMP10_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[ITER_CHECK:%.*]]
-; CHECK-LMUL4: iter.check:
-; CHECK-LMUL4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CONV]], 8
-; CHECK-LMUL4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-LMUL4: vector.main.loop.iter.check:
-; CHECK-LMUL4-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[CONV]], 32
-; CHECK-LMUL4-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-LMUL4: vector.ph:
-; CHECK-LMUL4-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CONV]], 32
-; CHECK-LMUL4-NEXT: [[N_VEC:%.*]] = sub i64 [[CONV]], [[N_MOD_VF]]
-; CHECK-LMUL4-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-LMUL4: vector.body:
-; CHECK-LMUL4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-LMUL4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-LMUL4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 16
-; CHECK-LMUL4-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[SRC1:%.*]], i64 [[TMP0]]
-; CHECK-LMUL4-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP1]]
-; CHECK-LMUL4-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-LMUL4-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <16 x float>*
-; CHECK-LMUL4-NEXT: [[WIDE_LOAD:%.*]] = load <16 x float>, <16 x float>* [[TMP5]], align 4
-; CHECK-LMUL4-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 16
-; CHECK-LMUL4-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <16 x float>*
-; CHECK-LMUL4-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x float>, <16 x float>* [[TMP7]], align 4
-; CHECK-LMUL4-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[SRC2:%.*]], i64 [[TMP0]]
-; CHECK-LMUL4-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP1]]
-; CHECK-LMUL4-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-LMUL4-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <16 x float>*
-; CHECK-LMUL4-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x float>, <16 x float>* [[TMP11]], align 4
-; CHECK-LMUL4-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 16
-; CHECK-LMUL4-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <16 x float>*
-; CHECK-LMUL4-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x float>, <16 x float>* [[TMP13]], align 4
-; CHECK-LMUL4-NEXT: [[TMP14:%.*]] = fadd <16 x float> [[WIDE_LOAD]], [[WIDE_LOAD3]]
-; CHECK-LMUL4-NEXT: [[TMP15:%.*]] = fadd <16 x float> [[WIDE_LOAD2]], [[WIDE_LOAD4]]
-; CHECK-LMUL4-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[RESULT:%.*]], i64 [[TMP0]]
-; CHECK-LMUL4-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[TMP1]]
-; CHECK-LMUL4-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-LMUL4-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
-; CHECK-LMUL4-NEXT: store <16 x float> [[TMP14]], <16 x float>* [[TMP19]], align 4
-; CHECK-LMUL4-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 16
-; CHECK-LMUL4-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <16 x float>*
-; CHECK-LMUL4-NEXT: store <16 x float> [[TMP15]], <16 x float>* [[TMP21]], align 4
-; CHECK-LMUL4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-LMUL4-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-LMUL4-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-LMUL4: middle.block:
-; CHECK-LMUL4-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CONV]], [[N_VEC]]
-; CHECK-LMUL4-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-LMUL4: vec.epilog.iter.check:
-; CHECK-LMUL4-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[CONV]], [[N_VEC]]
-; CHECK-LMUL4-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 8
-; CHECK-LMUL4-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK-LMUL4: vec.epilog.ph:
-; CHECK-LMUL4-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-LMUL4-NEXT: [[N_MOD_VF5:%.*]] = urem i64 [[CONV]], 8
-; CHECK-LMUL4-NEXT: [[N_VEC6:%.*]] = sub i64 [[CONV]], [[N_MOD_VF5]]
-; CHECK-LMUL4-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK-LMUL4: vec.epilog.vector.body:
-; CHECK-LMUL4-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-LMUL4-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-LMUL4-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP23]]
-; CHECK-LMUL4-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 0
-; CHECK-LMUL4-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <8 x float>*
-; CHECK-LMUL4-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x float>, <8 x float>* [[TMP26]], align 4
-; CHECK-LMUL4-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP23]]
-; CHECK-LMUL4-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP27]], i32 0
-; CHECK-LMUL4-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <8 x float>*
-; CHECK-LMUL4-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x float>, <8 x float>* [[TMP29]], align 4
-; CHECK-LMUL4-NEXT: [[TMP30:%.*]] = fadd <8 x float> [[WIDE_LOAD9]], [[WIDE_LOAD10]]
-; CHECK-LMUL4-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[TMP23]]
-; CHECK-LMUL4-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP31]], i32 0
-; CHECK-LMUL4-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>*
-; CHECK-LMUL4-NEXT: store <8 x float> [[TMP30]], <8 x float>* [[TMP33]], align 4
-; CHECK-LMUL4-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-LMUL4-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC6]]
-; CHECK-LMUL4-NEXT: br i1 [[TMP34]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-LMUL4: vec.epilog.middle.block:
-; CHECK-LMUL4-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[CONV]], [[N_VEC6]]
-; CHECK-LMUL4-NEXT: br i1 [[CMP_N7]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK-LMUL4: vec.epilog.scalar.ph:
-; CHECK-LMUL4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC6]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
-; CHECK-LMUL4-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-LMUL4: for.cond.cleanup.loopexit.loopexit:
-; CHECK-LMUL4-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT]]
-; CHECK-LMUL4: for.cond.cleanup.loopexit:
-; CHECK-LMUL4-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK-LMUL4: for.cond.cleanup:
-; CHECK-LMUL4-NEXT: ret void
-; CHECK-LMUL4: for.body:
-; CHECK-LMUL4-NEXT: [[I_011:%.*]] = phi i64 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-LMUL4-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[I_011]]
-; CHECK-LMUL4-NEXT: [[TMP35:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-LMUL4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[I_011]]
-; CHECK-LMUL4-NEXT: [[TMP36:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-LMUL4-NEXT: [[ADD:%.*]] = fadd float [[TMP35]], [[TMP36]]
-; CHECK-LMUL4-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[I_011]]
-; CHECK-LMUL4-NEXT: store float [[ADD]], float* [[ARRAYIDX3]], align 4
-; CHECK-LMUL4-NEXT: [[ADD4]] = add nuw nsw i64 [[I_011]], 1
-; CHECK-LMUL4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[ADD4]], [[CONV]]
-; CHECK-LMUL4-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-;
-; CHECK-LMUL8-LABEL: @add(
-; CHECK-LMUL8-NEXT: entry:
-; CHECK-LMUL8-NEXT: [[CONV:%.*]] = zext i32 [[SIZE:%.*]] to i64
-; CHECK-LMUL8-NEXT: [[CMP10_NOT:%.*]] = icmp eq i32 [[SIZE]], 0
-; CHECK-LMUL8-NEXT: br i1 [[CMP10_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[ITER_CHECK:%.*]]
-; CHECK-LMUL8: iter.check:
-; CHECK-LMUL8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[CONV]], 16
-; CHECK-LMUL8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-LMUL8: vector.main.loop.iter.check:
-; CHECK-LMUL8-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[CONV]], 32
-; CHECK-LMUL8-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-LMUL8: vector.ph:
-; CHECK-LMUL8-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[CONV]], 32
-; CHECK-LMUL8-NEXT: [[N_VEC:%.*]] = sub i64 [[CONV]], [[N_MOD_VF]]
-; CHECK-LMUL8-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-LMUL8: vector.body:
-; CHECK-LMUL8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-LMUL8-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-LMUL8-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[SRC1:%.*]], i64 [[TMP0]]
-; CHECK-LMUL8-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-LMUL8-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <32 x float>*
-; CHECK-LMUL8-NEXT: [[WIDE_LOAD:%.*]] = load <32 x float>, <32 x float>* [[TMP3]], align 4
-; CHECK-LMUL8-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[SRC2:%.*]], i64 [[TMP0]]
-; CHECK-LMUL8-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-LMUL8-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <32 x float>*
-; CHECK-LMUL8-NEXT: [[WIDE_LOAD2:%.*]] = load <32 x float>, <32 x float>* [[TMP6]], align 4
-; CHECK-LMUL8-NEXT: [[TMP7:%.*]] = fadd <32 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
-; CHECK-LMUL8-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[RESULT:%.*]], i64 [[TMP0]]
-; CHECK-LMUL8-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-LMUL8-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <32 x float>*
-; CHECK-LMUL8-NEXT: store <32 x float> [[TMP7]], <32 x float>* [[TMP10]], align 4
-; CHECK-LMUL8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-LMUL8-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-LMUL8-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-LMUL8: middle.block:
-; CHECK-LMUL8-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[CONV]], [[N_VEC]]
-; CHECK-LMUL8-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-LMUL8: vec.epilog.iter.check:
-; CHECK-LMUL8-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[CONV]], [[N_VEC]]
-; CHECK-LMUL8-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 16
-; CHECK-LMUL8-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK-LMUL8: vec.epilog.ph:
-; CHECK-LMUL8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-LMUL8-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[CONV]], 16
-; CHECK-LMUL8-NEXT: [[N_VEC4:%.*]] = sub i64 [[CONV]], [[N_MOD_VF3]]
-; CHECK-LMUL8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK-LMUL8: vec.epilog.vector.body:
-; CHECK-LMUL8-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-LMUL8-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-LMUL8-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP12]]
-; CHECK-LMUL8-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0
-; CHECK-LMUL8-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <16 x float>*
-; CHECK-LMUL8-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x float>, <16 x float>* [[TMP15]], align 4
-; CHECK-LMUL8-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP12]]
-; CHECK-LMUL8-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-LMUL8-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP17]] to <16 x float>*
-; CHECK-LMUL8-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x float>, <16 x float>* [[TMP18]], align 4
-; CHECK-LMUL8-NEXT: [[TMP19:%.*]] = fadd <16 x float> [[WIDE_LOAD7]], [[WIDE_LOAD8]]
-; CHECK-LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[TMP12]]
-; CHECK-LMUL8-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-LMUL8-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <16 x float>*
-; CHECK-LMUL8-NEXT: store <16 x float> [[TMP19]], <16 x float>* [[TMP22]], align 4
-; CHECK-LMUL8-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 16
-; CHECK-LMUL8-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
-; CHECK-LMUL8-NEXT: br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-LMUL8: vec.epilog.middle.block:
-; CHECK-LMUL8-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[CONV]], [[N_VEC4]]
-; CHECK-LMUL8-NEXT: br i1 [[CMP_N5]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK-LMUL8: vec.epilog.scalar.ph:
-; CHECK-LMUL8-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
-; CHECK-LMUL8-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-LMUL8: for.cond.cleanup.loopexit.loopexit:
-; CHECK-LMUL8-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT]]
-; CHECK-LMUL8: for.cond.cleanup.loopexit:
-; CHECK-LMUL8-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK-LMUL8: for.cond.cleanup:
-; CHECK-LMUL8-NEXT: ret void
-; CHECK-LMUL8: for.body:
-; CHECK-LMUL8-NEXT: [[I_011:%.*]] = phi i64 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-LMUL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[I_011]]
-; CHECK-LMUL8-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-LMUL8-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[I_011]]
-; CHECK-LMUL8-NEXT: [[TMP25:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-LMUL8-NEXT: [[ADD:%.*]] = fadd float [[TMP24]], [[TMP25]]
-; CHECK-LMUL8-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[RESULT]], i64 [[I_011]]
-; CHECK-LMUL8-NEXT: store float [[ADD]], float* [[ARRAYIDX3]], align 4
-; CHECK-LMUL8-NEXT: [[ADD4]] = add nuw nsw i64 [[I_011]], 1
-; CHECK-LMUL8-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[ADD4]], [[CONV]]
-; CHECK-LMUL8-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-;
+; CHECK-LMUL1: LV(REG): Found max usage: 2 item
+; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers
+; CHECK-LMUL1-NEXT: LV(REG): Found invariant usage: 1 item
+; CHECK-LMUL1-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 2 registers
+; CHECK-LMUL2: LV(REG): Found max usage: 2 item
+; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 4 registers
+; CHECK-LMUL2-NEXT: LV(REG): Found invariant usage: 1 item
+; CHECK-LMUL2-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 4 registers
+; CHECK-LMUL4: LV(REG): Found max usage: 2 item
+; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 8 registers
+; CHECK-LMUL4-NEXT: LV(REG): Found invariant usage: 1 item
+; CHECK-LMUL4-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 8 registers
+; CHECK-LMUL8: LV(REG): Found max usage: 2 item
+; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 16 registers
+; CHECK-LMUL8-NEXT: LV(REG): Found invariant usage: 1 item
+; CHECK-LMUL8-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 16 registers
entry:
%conv = zext i32 %size to i64
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -scalable-vectorization=on \
; RUN: -riscv-v-vector-bits-min=128 -riscv-v-vector-bits-max=128 \
; RUN: -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize \
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @add(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @add(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18]] = add <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19]] = add <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[ADD_LCSSA]]
-;
+; CHECK-LABEL: @add
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[ADD1:.*]] = add <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ADD:.*]] = add <vscale x 8 x i32> %[[ADD2]], %[[ADD1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> %[[ADD]])
entry:
br label %for.body
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @or(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @or(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18]] = or <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19]] = or <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = or <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[OR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[OR]] = or i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[OR_LCSSA:%.*]] = phi i32 [ [[OR]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[OR_LCSSA]]
-;
+; CHECK-LABEL: @or
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[OR1:.*]] = or <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[OR2:.*]] = or <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[OR:.*]] = or <vscale x 8 x i32> %[[OR2]], %[[OR1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.or.nxv8i32(<vscale x 8 x i32> %[[OR]])
entry:
br label %for.body
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @and(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @and(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 -1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 -1, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18]] = and <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19]] = and <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = and <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[AND:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[AND]] = and i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[AND_LCSSA:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[AND_LCSSA]]
-;
+; CHECK-LABEL: @and
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[AND1:.*]] = and <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[AND2:.*]] = and <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[AND:.*]] = and <vscale x 8 x i32> %[[AND2]], %[[AND1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.and.nxv8i32(<vscale x 8 x i32> %[[AND]])
entry:
br label %for.body
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define i32 @xor(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @xor(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 2, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18]] = xor <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19]] = xor <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <vscale x 8 x i32> [[TMP19]], [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[XOR:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[XOR]] = xor i32 [[TMP24]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[XOR_LCSSA:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[XOR_LCSSA]]
-;
+; CHECK-LABEL: @xor
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[XOR1:.*]] = xor <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[XOR2:.*]] = xor <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[XOR:.*]] = xor <vscale x 8 x i32> %[[XOR2]], %[[XOR1]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.xor.nxv8i32(<vscale x 8 x i32> %[[XOR]])
entry:
br label %for.body
; SMIN
define i32 @smin(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @smin(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = icmp slt <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp slt <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x i32> [[WIDE_LOAD]], <vscale x 8 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i32> [[WIDE_LOAD2]], <vscale x 8 x i32> [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <vscale x 8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x i32> [[TMP20]], <vscale x 8 x i32> [[TMP21]]
-; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = icmp slt i32 [[TMP26]], [[SUM_010]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], i32 [[TMP26]], i32 [[SUM_010]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @smin
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[ICMP1:.*]] = icmp slt <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[ICMP2:.*]] = icmp slt <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ICMP:.*]] = icmp slt <vscale x 8 x i32> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[ICMP]], <vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.smin.nxv8i32(<vscale x 8 x i32> %[[SEL]])
entry:
br label %for.body
; UMAX
define i32 @umax(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @umax(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 2, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer), [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <vscale x 8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i32>, <vscale x 8 x i32>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = icmp ugt <vscale x 8 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19:%.*]] = icmp ugt <vscale x 8 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x i32> [[WIDE_LOAD]], <vscale x 8 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i32> [[WIDE_LOAD2]], <vscale x 8 x i32> [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <vscale x 8 x i32> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x i32> [[TMP20]], <vscale x 8 x i32> [[TMP21]]
-; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP26:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = icmp ugt i32 [[TMP26]], [[SUM_010]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], i32 [[TMP26]], i32 [[SUM_010]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi i32 [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @umax
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x i32>
+; CHECK: %[[ICMP1:.*]] = icmp ugt <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[ICMP2:.*]] = icmp ugt <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[ICMP1]], <vscale x 8 x i32> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[ICMP2]], <vscale x 8 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ICMP:.*]] = icmp ugt <vscale x 8 x i32> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[ICMP]], <vscale x 8 x i32> %[[SEL1]], <vscale x 8 x i32> %[[SEL2]]
+; CHECK-NEXT: call i32 @llvm.vector.reduce.umax.nxv8i32(<vscale x 8 x i32> %[[SEL]])
entry:
br label %for.body
; FADD (FAST)
define float @fadd_fast(float* noalias nocapture readonly %a, i64 %n) {
-; CHECK-LABEL: @fadd_fast(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> zeroinitializer, float 0.000000e+00, i32 0), [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18]] = fadd fast <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19]] = fadd fast <vscale x 8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP21]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <vscale x 8 x float> [[TMP19]], [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = fadd fast float [[TMP24]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[ADD_LCSSA]]
-;
+; CHECK-LABEL: @fadd_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
+; CHECK: %[[ADD1:.*]] = fadd fast <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = fadd fast <vscale x 8 x float> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[ADD:.*]] = fadd fast <vscale x 8 x float> %[[ADD2]], %[[ADD1]]
+; CHECK-NEXT: call fast float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> %[[ADD]])
entry:
br label %for.body
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 8, interleaved count: 2)
define bfloat @fadd_fast_bfloat(bfloat* noalias nocapture readonly %a, i64 %n) {
-; CHECK-LABEL: @fadd_fast_bfloat(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 16
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x bfloat> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <8 x bfloat> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds bfloat, bfloat* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds bfloat, bfloat* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds bfloat, bfloat* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast bfloat* [[TMP4]] to <8 x bfloat>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds bfloat, bfloat* [[TMP2]], i32 8
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast bfloat* [[TMP6]] to <8 x bfloat>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x bfloat>, <8 x bfloat>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8]] = fadd fast <8 x bfloat> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP9]] = fadd fast <8 x bfloat> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <8 x bfloat> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi bfloat [ 0xR0000, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi bfloat [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds bfloat, bfloat* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = load bfloat, bfloat* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = fadd fast bfloat [[TMP12]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi bfloat [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret bfloat [[ADD_LCSSA]]
-;
+; CHECK-LABEL: @fadd_fast_bfloat
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <8 x bfloat>
+; CHECK: %[[LOAD2:.*]] = load <8 x bfloat>
+; CHECK: %[[FADD1:.*]] = fadd fast <8 x bfloat> %[[LOAD1]]
+; CHECK: %[[FADD2:.*]] = fadd fast <8 x bfloat> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[RDX:.*]] = fadd fast <8 x bfloat> %[[FADD2]], %[[FADD1]]
+; CHECK: call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR8000, <8 x bfloat> %[[RDX]])
entry:
br label %for.body
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define float @fmin_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
-; CHECK-LABEL: @fmin_fast(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = fcmp olt <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19:%.*]] = fcmp olt <vscale x 8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp olt <vscale x 8 x float> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x float> [[TMP20]], <vscale x 8 x float> [[TMP21]]
-; CHECK-NEXT: [[TMP25:%.*]] = call float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = fcmp olt float [[TMP26]], [[SUM_07]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], float [[TMP26]], float [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @fmin_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
+; CHECK: %[[FCMP1:.*]] = fcmp olt <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp olt <vscale x 8 x float> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x float> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp olt <vscale x 8 x float> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x float> %[[SEL1]], <vscale x 8 x float> %[[SEL2]]
+; CHECK-NEXT: call float @llvm.vector.reduce.fmin.nxv8f32(<vscale x 8 x float> %[[SEL]])
entry:
br label %for.body
; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
define float @fmax_fast(float* noalias nocapture readonly %a, i64 %n) #0 {
-; CHECK-LABEL: @fmax_fast(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = fcmp fast ogt <vscale x 8 x float> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP19:%.*]] = fcmp fast ogt <vscale x 8 x float> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP20]] = select <vscale x 8 x i1> [[TMP18]], <vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[VEC_PHI]]
-; CHECK-NEXT: [[TMP21]] = select <vscale x 8 x i1> [[TMP19]], <vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[VEC_PHI1]]
-; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <vscale x 8 x float> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <vscale x 8 x i1> [[RDX_MINMAX_CMP]], <vscale x 8 x float> [[TMP20]], <vscale x 8 x float> [[TMP21]]
-; CHECK-NEXT: [[TMP25:%.*]] = call fast float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> [[RDX_MINMAX_SELECT]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi float [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[DOTSROA_SPECULATED:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP26:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast ogt float [[TMP26]], [[SUM_07]]
-; CHECK-NEXT: [[DOTSROA_SPECULATED]] = select i1 [[CMP_I]], float [[TMP26]], float [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[DOTSROA_SPECULATED_LCSSA:%.*]] = phi float [ [[DOTSROA_SPECULATED]], [[FOR_BODY]] ], [ [[TMP25]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[DOTSROA_SPECULATED_LCSSA]]
-;
+; CHECK-LABEL: @fmax_fast
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x float>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x float>
+; CHECK: %[[FCMP1:.*]] = fcmp fast ogt <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[FCMP2:.*]] = fcmp fast ogt <vscale x 8 x float> %[[LOAD2]]
+; CHECK: %[[SEL1:.*]] = select <vscale x 8 x i1> %[[FCMP1]], <vscale x 8 x float> %[[LOAD1]]
+; CHECK: %[[SEL2:.*]] = select <vscale x 8 x i1> %[[FCMP2]], <vscale x 8 x float> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[FCMP:.*]] = fcmp fast ogt <vscale x 8 x float> %[[SEL1]], %[[SEL2]]
+; CHECK-NEXT: %[[SEL:.*]] = select fast <vscale x 8 x i1> %[[FCMP]], <vscale x 8 x float> %[[SEL1]], <vscale x 8 x float> %[[SEL2]]
+; CHECK-NEXT: call fast float @llvm.vector.reduce.fmax.nxv8f32(<vscale x 8 x float> %[[SEL]])
entry:
br label %for.body
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
define i32 @mul(i32* nocapture %a, i32* nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @mul(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8]] = mul <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP9]] = mul <4 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <4 x i32> [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[SUM_07:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[MUL:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL]] = mul nsw i32 [[TMP12]], [[SUM_07]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[MUL_LCSSA]]
-;
+; CHECK-LABEL: @mul
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <4 x i32>
+; CHECK: %[[LOAD2:.*]] = load <4 x i32>
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD1]]
+; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
entry:
br label %for.body
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 4, interleaved count: 2)
define i32 @memory_dependence(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i64 %n) {
-; CHECK-LABEL: @memory_dependence(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ <i32 2, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 4
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD2]]
-; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw i64 [[TMP0]], 32
-; CHECK-NEXT: [[TMP17:%.*]] = add nuw nsw i64 [[TMP1]], 32
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP21]], align 4
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 4
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP23]], align 4
-; CHECK-NEXT: [[TMP24]] = mul <4 x i32> [[WIDE_LOAD3]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP25]] = mul <4 x i32> [[WIDE_LOAD4]], [[VEC_PHI1]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <4 x i32> [[TMP25]], [[TMP24]]
-; CHECK-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 2, [[ENTRY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ [[MUL:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I]]
-; CHECK-NEXT: [[TMP28:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
-; CHECK-NEXT: [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP29]], [[TMP28]]
-; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i64 [[I]], 32
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[ADD2]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT: [[MUL]] = mul nsw i32 [[TMP29]], [[SUM]]
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[MUL_LCSSA]]
-;
+; CHECK-LABEL: @memory_dependence
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <4 x i32>
+; CHECK: %[[LOAD2:.*]] = load <4 x i32>
+; CHECK: %[[LOAD3:.*]] = load <4 x i32>
+; CHECK: %[[LOAD4:.*]] = load <4 x i32>
+; CHECK: %[[ADD1:.*]] = add nsw <4 x i32> %[[LOAD3]], %[[LOAD1]]
+; CHECK: %[[ADD2:.*]] = add nsw <4 x i32> %[[LOAD4]], %[[LOAD2]]
+; CHECK: %[[MUL1:.*]] = mul <4 x i32> %[[LOAD3]]
+; CHECK: %[[MUL2:.*]] = mul <4 x i32> %[[LOAD4]]
+; CHECK: middle.block:
+; CHECK: %[[RDX:.*]] = mul <4 x i32> %[[MUL2]], %[[MUL1]]
+; CHECK: call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %[[RDX]])
entry:
br label %for.body
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 6
-; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[TMP12]], i32 1
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <2 x i32> [[TMP12]], i32 0
; CHECK-NEXT: store i32 [[SCALAR_RECUR]], i32* [[B_PTR]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 5
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -mtriple=s390x-linux-gnu -tiny-trip-count-interleave-threshold=4 -vectorizer-min-trip-count=8 < %s | FileCheck %s
; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -tiny-trip-count-interleave-threshold=4 -vectorizer-min-trip-count=8 < %s | FileCheck %s
define i32 @main(i32 %arg, i8** nocapture readnone %arg1) #0 {
-; CHECK-LABEL: @main(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = alloca i8, align 1
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i8
-; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[OFFSET_IDX]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = add i8 [[OFFSET_IDX]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = add i8 [[OFFSET_IDX]], 3
-; CHECK-NEXT: [[TMP5:%.*]] = add i8 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = add i8 [[OFFSET_IDX]], 5
-; CHECK-NEXT: [[TMP7:%.*]] = add i8 [[OFFSET_IDX]], 6
-; CHECK-NEXT: [[TMP8:%.*]] = add i8 [[OFFSET_IDX]], 7
-; CHECK-NEXT: store i8 [[TMP1]], i8* [[TMP0]], align 2
-; CHECK-NEXT: store i8 [[TMP2]], i8* [[TMP0]], align 2
-; CHECK-NEXT: store i8 [[TMP3]], i8* [[TMP0]], align 2
-; CHECK-NEXT: store i8 [[TMP4]], i8* [[TMP0]], align 2
-; CHECK-NEXT: store i8 [[TMP5]], i8* [[TMP0]], align 2
-; CHECK-NEXT: store i8 [[TMP6]], i8* [[TMP0]], align 2
-; CHECK-NEXT: store i8 [[TMP7]], i8* [[TMP0]], align 2
-; CHECK-NEXT: store i8 [[TMP8]], i8* [[TMP0]], align 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 8
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 9, 8
-; CHECK-NEXT: br i1 [[CMP_N]], label [[RET:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ 8, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[STOREMERGE_I_I:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[TMP12_I_I:%.*]], [[LOOP]] ]
-; CHECK-NEXT: store i8 [[STOREMERGE_I_I]], i8* [[TMP0]], align 2
-; CHECK-NEXT: [[TMP8_I_I:%.*]] = icmp ult i8 [[STOREMERGE_I_I]], 8
-; CHECK-NEXT: [[TMP12_I_I]] = add nuw nsw i8 [[STOREMERGE_I_I]], 1
-; CHECK-NEXT: br i1 [[TMP8_I_I]], label [[LOOP]], label [[RET]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: ret:
-; CHECK-NEXT: ret i32 0
-;
+; CHECK: vector.body:
entry:
%0 = alloca i8, align 1
br label %loop
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -disable-loop-unrolling -debug-only=loop-vectorize -passes='default<O3>' -S 2>&1 | FileCheck %s
; RUN: opt < %s -disable-loop-unrolling -debug-only=loop-vectorize -O3 -S 2>&1 | FileCheck %s
; REQUIRES: asserts
; Function Attrs: nounwind readonly uwtable
define i32 @vect() {
-; CHECK-LABEL: @vect(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [255 x i32], [255 x i32]* @a, i64 0, i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP5]] = add <4 x i32> [[WIDE_LOAD2]], [[VEC_PHI1]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 248
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 248, [[MIDDLE_BLOCK]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[RED_05:%.*]] = phi i32 [ [[TMP7]], [[MIDDLE_BLOCK]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [255 x i32], [255 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP8]], [[RED_05]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 255
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret i32 [[ADD]]
-;
+; CHECK: LV: Checking a loop in 'vect'
entry:
br label %for.body
for.body: ; preds = %for.body, %entry
; We need to make sure we did vectorize the loop
+; CHECK: LV: Found a loop: for.body
+; CHECK: LV: We can vectorize this loop!
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%red.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds [255 x i32], [255 x i32]* @a, i64 0, i64 %indvars.iv
br i1 %exitcond, label %for.end, label %for.body
; If it did, we have two loops:
+; CHECK: vector.body:
+; CHECK: br {{.*}} label %vector.body, !llvm.loop [[vect:![0-9]+]]
+; CHECK: for.body:
+; CHECK: br {{.*}} label %for.body{{.*}}, !llvm.loop [[scalar:![0-9]+]]
for.end: ; preds = %for.body
ret i32 %add
}
; Now, we check for the Hint metadata
+; CHECK: [[vect]] = distinct !{[[vect]], [[width:![0-9]+]]}
+; CHECK: [[width]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[scalar]] = distinct !{[[scalar]], [[runtime_unroll:![0-9]+]], [[width]]}
+; CHECK: [[runtime_unroll]] = !{!"llvm.loop.unroll.runtime.disable"}
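; For reference, the metadata these patterns match typically takes the following shape in
; the output module (the !N ids are illustrative; only the structure is checked above):
;   !0 = distinct !{!0, !1}
;   !1 = !{!"llvm.loop.isvectorized", i32 1}
;   !2 = distinct !{!2, !3, !1}
;   !3 = !{!"llvm.loop.unroll.runtime.disable"}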
; CHECK-NEXT: store <2 x i16*> <i16* getelementptr inbounds ([1 x %rec8], [1 x %rec8]* @a, i32 0, i32 0, i32 0), i16* getelementptr inbounds ([1 x %rec8], [1 x %rec8]* @a, i32 0, i32 0, i32 0)>, <2 x i16*>* [[TMP4]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 2
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 2, 2
; CHECK-NEXT: br i1 [[CMP_N]], label [[BB3:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: store i16* [[_TMP4]], i16** [[_TMP7]], align 8
; CHECK-NEXT: [[_TMP9]] = add nsw i16 [[C_1_0]], 1
; CHECK-NEXT: [[_TMP11:%.*]] = icmp slt i16 [[_TMP9]], 2
-; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[_TMP11]], label [[BB2]], label [[BB3]], [[LOOP2:!llvm.loop !.*]]
; CHECK: bb3:
; CHECK-NEXT: ret void
;
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i8> [ <i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <32 x i8> [ <i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23, i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 3, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <32 x i8>*
-; CHECK-NEXT: store <32 x i8> [[VEC_IND]], <32 x i8>* [[TMP8]], align 1
+; CHECK-NEXT: store <32 x i8> [[VEC_IND1]], <32 x i8>* [[TMP8]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i8> [[VEC_IND]], <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>
+; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <32 x i8> [[VEC_IND1]], <i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32, i8 32>
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <2 x i64> [[STEP_ADD1]], <i64 2, i64 2>
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 9, [[INDEX]]
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 6
-; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[VEC_IND]], <i64 3, i64 3>
-; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i64> [[STEP_ADD]], <i64 3, i64 3>
-; CHECK-NEXT: [[TMP11:%.*]] = add nsw <2 x i64> [[STEP_ADD1]], <i64 3, i64 3>
-; CHECK-NEXT: [[TMP12:%.*]] = add nsw <2 x i64> [[STEP_ADD2]], <i64 3, i64 3>
-; CHECK-NEXT: [[TMP13:%.*]] = sitofp <2 x i64> [[TMP9]] to <2 x float>
-; CHECK-NEXT: [[TMP14:%.*]] = sitofp <2 x i64> [[TMP10]] to <2 x float>
-; CHECK-NEXT: [[TMP15:%.*]] = sitofp <2 x i64> [[TMP11]] to <2 x float>
-; CHECK-NEXT: [[TMP16:%.*]] = sitofp <2 x i64> [[TMP12]] to <2 x float>
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP7]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP8]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> [[TMP13]], <2 x float>* [[TMP22]], align 4
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 2
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast float* [[TMP23]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> [[TMP14]], <2 x float>* [[TMP24]], align 4
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 4
+; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 6
+; CHECK-NEXT: [[TMP13:%.*]] = add nsw <2 x i64> [[VEC_IND]], <i64 3, i64 3>
+; CHECK-NEXT: [[TMP14:%.*]] = add nsw <2 x i64> [[STEP_ADD]], <i64 3, i64 3>
+; CHECK-NEXT: [[TMP15:%.*]] = add nsw <2 x i64> [[STEP_ADD1]], <i64 3, i64 3>
+; CHECK-NEXT: [[TMP16:%.*]] = add nsw <2 x i64> [[STEP_ADD2]], <i64 3, i64 3>
+; CHECK-NEXT: [[TMP17:%.*]] = sitofp <2 x i64> [[TMP13]] to <2 x float>
+; CHECK-NEXT: [[TMP18:%.*]] = sitofp <2 x i64> [[TMP14]] to <2 x float>
+; CHECK-NEXT: [[TMP19:%.*]] = sitofp <2 x i64> [[TMP15]] to <2 x float>
+; CHECK-NEXT: [[TMP20:%.*]] = sitofp <2 x i64> [[TMP16]] to <2 x float>
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
+; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 0
; CHECK-NEXT: [[TMP26:%.*]] = bitcast float* [[TMP25]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> [[TMP15]], <2 x float>* [[TMP26]], align 4
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 6
+; CHECK-NEXT: store <2 x float> [[TMP17]], <2 x float>* [[TMP26]], align 4
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 2
; CHECK-NEXT: [[TMP28:%.*]] = bitcast float* [[TMP27]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> [[TMP16]], <2 x float>* [[TMP28]], align 4
+; CHECK-NEXT: store <2 x float> [[TMP18]], <2 x float>* [[TMP28]], align 4
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 4
+; CHECK-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP19]], <2 x float>* [[TMP30]], align 4
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[TMP21]], i32 6
+; CHECK-NEXT: [[TMP32:%.*]] = bitcast float* [[TMP31]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP20]], <2 x float>* [[TMP32]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD2]], <i64 2, i64 2>
-; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP4]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* null, i64 [[TMP3]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[X:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i8> poison, i8 [[X]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT2]], <4 x i8> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT2]] to <4 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw <4 x i32> [[TMP4]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK-NEXT: [[TMP7:%.*]] = shl nuw <4 x i32> [[TMP5]], <i32 24, i32 24, i32 24, i32 24>
-; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[P:%.*]], align 1, !tbaa [[TBAA1:![0-9]+]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i8> poison, i8 [[TMP8]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT3]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <4 x i8> poison, i8 [[TMP9]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT5]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT4]] to <4 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT6]] to <4 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw <4 x i32> [[TMP10]], <i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw <4 x i32> [[TMP11]], <i32 16, i32 16, i32 16, i32 16>
-; CHECK-NEXT: [[TMP14:%.*]] = or <4 x i32> [[TMP12]], [[TMP6]]
-; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i32> [[TMP13]], [[TMP7]]
-; CHECK-NEXT: [[TMP16:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i8> poison, i8 [[TMP16]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT7]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i8> poison, i8 [[TMP17]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT9]], <4 x i8> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i32> [[TMP14]], zeroinitializer
-; CHECK-NEXT: [[TMP19:%.*]] = or <4 x i32> [[TMP15]], zeroinitializer
-; CHECK-NEXT: [[TMP20:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT8]] to <4 x i32>
-; CHECK-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT10]] to <4 x i32>
-; CHECK-NEXT: [[TMP22:%.*]] = or <4 x i32> [[TMP18]], [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = or <4 x i32> [[TMP19]], [[TMP21]]
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP22]], i32 0
-; CHECK-NEXT: store i32 [[TMP24]], i32* undef, align 4, !tbaa [[TBAA4:![0-9]+]]
-; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP22]], i32 1
-; CHECK-NEXT: store i32 [[TMP25]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP22]], i32 2
-; CHECK-NEXT: store i32 [[TMP26]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP22]], i32 3
-; CHECK-NEXT: store i32 [[TMP27]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP23]], i32 0
-; CHECK-NEXT: store i32 [[TMP28]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP23]], i32 1
+; CHECK-NEXT: [[TMP8:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT]] to <4 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT3]] to <4 x i32>
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw <4 x i32> [[TMP8]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK-NEXT: [[TMP11:%.*]] = shl nuw <4 x i32> [[TMP9]], <i32 24, i32 24, i32 24, i32 24>
+; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[P:%.*]], align 1, !tbaa [[TBAA1:![0-9]+]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i8> poison, i8 [[TMP12]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT4]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i8> poison, i8 [[TMP13]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT6]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP14:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT5]] to <4 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT7]] to <4 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw <4 x i32> [[TMP14]], <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw <4 x i32> [[TMP15]], <i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i32> [[TMP16]], [[TMP10]]
+; CHECK-NEXT: [[TMP19:%.*]] = or <4 x i32> [[TMP17]], [[TMP11]]
+; CHECK-NEXT: [[TMP20:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[TMP20]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT8]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP21:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i8> poison, i8 [[TMP21]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT11:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT10]], <4 x i8> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP22:%.*]] = or <4 x i32> [[TMP18]], zeroinitializer
+; CHECK-NEXT: [[TMP23:%.*]] = or <4 x i32> [[TMP19]], zeroinitializer
+; CHECK-NEXT: [[TMP24:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT9]] to <4 x i32>
+; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[BROADCAST_SPLAT11]] to <4 x i32>
+; CHECK-NEXT: [[TMP26:%.*]] = or <4 x i32> [[TMP22]], [[TMP24]]
+; CHECK-NEXT: [[TMP27:%.*]] = or <4 x i32> [[TMP23]], [[TMP25]]
+; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP26]], i32 0
+; CHECK-NEXT: store i32 [[TMP28]], i32* undef, align 4, !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP26]], i32 1
; CHECK-NEXT: store i32 [[TMP29]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP23]], i32 2
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP26]], i32 2
; CHECK-NEXT: store i32 [[TMP30]], i32* undef, align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3
+; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP26]], i32 3
; CHECK-NEXT: store i32 [[TMP31]], i32* undef, align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP27]], i32 0
+; CHECK-NEXT: store i32 [[TMP32]], i32* undef, align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i32> [[TMP27]], i32 1
+; CHECK-NEXT: store i32 [[TMP33]], i32* undef, align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP27]], i32 2
+; CHECK-NEXT: store i32 [[TMP34]], i32* undef, align 4, !tbaa [[TBAA4]]
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i32> [[TMP27]], i32 3
+; CHECK-NEXT: store i32 [[TMP35]], i32* undef, align 4, !tbaa [[TBAA4]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[SW_EPILOG:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[P_359:%.*]] = phi i8* [ [[ADD_PTR86:%.*]], [[FOR_BODY68]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[CONV70:%.*]] = zext i8 [[X]] to i32
; CHECK-NEXT: [[SHL71:%.*]] = shl nuw i32 [[CONV70]], 24
-; CHECK-NEXT: [[TMP33:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
-; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP33]] to i32
+; CHECK-NEXT: [[TMP37:%.*]] = load i8, i8* [[P]], align 1, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[CONV73:%.*]] = zext i8 [[TMP37]] to i32
; CHECK-NEXT: [[SHL74:%.*]] = shl nuw nsw i32 [[CONV73]], 16
; CHECK-NEXT: [[OR75:%.*]] = or i32 [[SHL74]], [[SHL71]]
-; CHECK-NEXT: [[TMP34:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
+; CHECK-NEXT: [[TMP38:%.*]] = load i8, i8* undef, align 1, !tbaa [[TBAA1]]
; CHECK-NEXT: [[SHL78:%.*]] = shl nuw nsw i32 undef, 8
; CHECK-NEXT: [[OR79:%.*]] = or i32 [[OR75]], [[SHL78]]
-; CHECK-NEXT: [[CONV81:%.*]] = zext i8 [[TMP34]] to i32
+; CHECK-NEXT: [[CONV81:%.*]] = zext i8 [[TMP38]] to i32
; CHECK-NEXT: [[OR83:%.*]] = or i32 [[OR79]], [[CONV81]]
; CHECK-NEXT: store i32 [[OR83]], i32* undef, align 4, !tbaa [[TBAA4]]
; CHECK-NEXT: [[ADD_PTR86]] = getelementptr inbounds i8, i8* [[P_359]], i64 4
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s
; Make sure that integer poison-generating flags (i.e., nuw/nsw, exact and inbounds) are
; dropped from scalar instructions that contribute to the address computation of a masked
; memory operation, since such instructions execute unconditionally once the block is
; linearized and masked after vectorization.
; Drop poison-generating flags from 'sub' and 'getelementptr' feeding a masked load.
; Test for PR52111.
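; As a minimal sketch of the issue (the instruction names below are taken from the scalar
; loop of this test; the lane value is illustrative): under the guard
;   %i23 = icmp eq i64 %iv, 0
; the address computation
;   %i27 = sub nuw nsw i64 %iv, 1
;   %i29 = getelementptr inbounds float, float* %input, i64 %i27
; only executes when %iv != 0. After predication the block is linearized and the load
; becomes a masked load, so these instructions run for every lane; with nuw/nsw and
; inbounds kept, the %iv == 0 lane would compute poison feeding the masked load's
; address, which is why the flags have to be dropped.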
define void @drop_scalar_nuw_nsw(float* noalias nocapture readonly %input,
+ float* %output) local_unnamed_addr #0 {
; CHECK-LABEL: @drop_scalar_nuw_nsw(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP6]], i32 4, <4 x i1> [[TMP4]], <4 x float> poison), !invariant.load !0
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP7]], i32 0
+; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[TMP6]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT: [[I23:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT: br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[I27:%.*]] = sub nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[I29:%.*]] = getelementptr inbounds float, float* [[INPUT]], i64 [[I27]]
-; CHECK-NEXT: [[I30:%.*]] = load float, float* [[I29]], align 4, !invariant.load !0
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ]
-; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, float* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT: store float [[I34]], float* [[I35]], align 4
-; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: loop.exit:
-; CHECK-NEXT: ret void
-;
- float* %output) local_unnamed_addr #0 {
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP9]], i32 4, <4 x i1> [[TMP7]], <4 x float> poison), !invariant.load !0
entry:
br label %loop.header
; Drop poison-generating flags from 'sub' and 'getelementptr' feeding a masked load.
; In this case, 'sub' and 'getelementptr' are not guarded by the predicate.
define void @drop_nonpred_scalar_nuw_nsw(float* noalias nocapture readonly %input,
+ float* %output) local_unnamed_addr #0 {
; CHECK-LABEL: @drop_nonpred_scalar_nuw_nsw(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[TMP5]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP6]], i32 4, <4 x i1> [[TMP4]], <4 x float> poison), !invariant.load !0
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP4]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP7]], i32 0
+; CHECK: [[TMP5:%.*]] = sub i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[TMP6]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT: [[I27:%.*]] = sub i64 [[IV]], 1
-; CHECK-NEXT: [[I29:%.*]] = getelementptr float, float* [[INPUT]], i64 [[I27]]
-; CHECK-NEXT: [[I23:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT: br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[I30:%.*]] = load float, float* [[I29]], align 4, !invariant.load !0
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ]
-; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, float* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT: store float [[I34]], float* [[I35]], align 4
-; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: loop.exit:
-; CHECK-NEXT: ret void
-;
- float* %output) local_unnamed_addr #0 {
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP9]], i32 4, <4 x i1> [[TMP7]], <4 x float> poison), !invariant.load !0
entry:
br label %loop.header
; Preserve poison-generating flags from vector 'sub', 'mul' and 'getelementptr' feeding a masked gather.
define void @preserve_vector_nuw_nsw(float* noalias nocapture readonly %input,
+ float* %output) local_unnamed_addr #0 {
; CHECK-LABEL: @preserve_vector_nuw_nsw(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP3:%.*]] = mul nuw nsw <4 x i64> [[TMP2]], <i64 2, i64 2, i64 2, i64 2>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], <4 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP4]], i32 4, <4 x i1> [[TMP5]], <4 x float> undef), !invariant.load !0
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x float> [[WIDE_MASKED_GATHER]], <4 x float> zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], <4 x float>* [[TMP8]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT: [[I23:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT: br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[I27:%.*]] = sub nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[I28:%.*]] = mul nuw nsw i64 [[I27]], 2
-; CHECK-NEXT: [[I29:%.*]] = getelementptr inbounds float, float* [[INPUT]], i64 [[I28]]
-; CHECK-NEXT: [[I30:%.*]] = load float, float* [[I29]], align 4, !invariant.load !0
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ]
-; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, float* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT: store float [[I34]], float* [[I35]], align 4
-; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: loop.exit:
-; CHECK-NEXT: ret void
-;
- float* %output) local_unnamed_addr #0 {
+; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP6:%.*]] = mul nuw nsw <4 x i64> [[TMP5]], <i64 2, i64 2, i64 2, i64 2>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], <4 x i64> [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP7]], i32 4, <4 x i1> [[TMP8]], <4 x float> undef), !invariant.load !0
entry:
br label %loop.header
; Drop poison-generating flags from vector 'sub' and 'gep' feeding a masked load.
define void @drop_vector_nuw_nsw(float* noalias nocapture readonly %input,
+ float* %output, float** noalias %ptrs) local_unnamed_addr #0 {
; CHECK-LABEL: @drop_vector_nuw_nsw(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float*, float** [[PTRS:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, float* [[INPUT:%.*]], <4 x i64> [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float*, float** [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast float** [[TMP5]] to <4 x float*>*
-; CHECK-NEXT: store <4 x float*> [[TMP4]], <4 x float*>* [[TMP6]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float*> [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP10]], i32 4, <4 x i1> [[TMP7]], <4 x float> poison), !invariant.load !0
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0
+; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float*, float** [[PTRS:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr float, float* [[INPUT:%.*]], <4 x i64> [[TMP6]]
+; CHECK: [[TMP10:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float*> [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr float, float* [[TMP11]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT: [[I23:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds float*, float** [[PTRS]], i64 [[IV]]
-; CHECK-NEXT: [[I27:%.*]] = sub nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[I29:%.*]] = getelementptr inbounds float, float* [[INPUT]], i64 [[I27]]
-; CHECK-NEXT: store float* [[I29]], float** [[GEP]], align 8
-; CHECK-NEXT: br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[I30:%.*]] = load float, float* [[I29]], align 4, !invariant.load !0
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ]
-; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, float* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT: store float [[I34]], float* [[I35]], align 4
-; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: loop.exit:
-; CHECK-NEXT: ret void
-;
- float* %output, float** noalias %ptrs) local_unnamed_addr #0 {
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP13]], i32 4, <4 x i1> [[TMP10]], <4 x float> poison), !invariant.load !0
entry:
br label %loop.header
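; The following is an illustrative sketch only (hypothetical names, not a RUN/CHECK
; input of this file): the scalar shape the drop_* tests above exercise. The
; 'sub nuw nsw' and 'getelementptr inbounds' are well-defined only on iterations
; where %iv != 0; after if-conversion the widened copies also run for masked-off
; lanes, so the vectorizer has to drop the poison-generating flags before the
; address reaches a masked load/gather.
define void @poison_flag_sketch(float* noalias %in, float* noalias %out) {
entry:
  br label %loop
loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
  %is.zero = icmp eq i64 %iv, 0
  br i1 %is.zero, label %latch, label %then
then:
  %idx = sub nuw nsw i64 %iv, 1                      ; poison when %iv == 0
  %addr = getelementptr inbounds float, float* %in, i64 %idx
  %val = load float, float* %addr, align 4
  br label %latch
latch:
  %res = phi float [ 0.000000e+00, %loop ], [ %val, %then ]
  %out.addr = getelementptr inbounds float, float* %out, i64 %iv
  store float %res, float* %out.addr, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 4
  br i1 %exitcond, label %exit, label %loop
exit:
  ret void
}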
; Preserve poison-generating flags on 'sub' (nuw/nsw) when its result does not feed the address
; of any masked load/store/gather/scatter.
define void @preserve_nuw_nsw_no_addr(i64* %output) local_unnamed_addr #0 {
; CHECK-LABEL: @preserve_nuw_nsw_no_addr(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[TMP2]], <4 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <4 x i64>*
-; CHECK-NEXT: store <4 x i64> [[PREDPHI]], <4 x i64>* [[TMP6]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT: [[I23:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT: br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[I27:%.*]] = sub nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[I34:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[I27]], [[IF_THEN]] ]
-; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds i64, i64* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT: store i64 [[I34]], i64* [[I35]], align 4
-; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: loop.exit:
-; CHECK-NEXT: ret void
-;
+; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = sub nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP5]], <4 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[OUTPUT:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64* [[TMP8]] to <4 x i64>*
+; CHECK-NEXT: store <4 x i64> [[PREDPHI]], <4 x i64>* [[TMP9]], align 4
entry:
br label %loop.header
; Drop poison-generating flags from 'sdiv' and 'getelementptr' feeding a masked load.
define void @drop_scalar_exact(float* noalias nocapture readonly %input,
+ float* %output) local_unnamed_addr #0 {
; CHECK-LABEL: @drop_scalar_exact(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i1> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = sdiv i64 [[TMP0]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP9]], i32 4, <4 x i1> [[TMP7]], <4 x float> poison), !invariant.load !0
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_MASKED_LOAD]], <4 x float> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
+; CHECK: [[TMP4:%.*]] = icmp ne <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i1> [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = sdiv i64 [[TMP0]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, float* [[INPUT:%.*]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr float, float* [[TMP9]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], <4 x float>* [[TMP12]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT: [[I7:%.*]] = icmp ne i64 [[IV]], 0
-; CHECK-NEXT: [[I8:%.*]] = and i64 [[IV]], 1
-; CHECK-NEXT: [[I9:%.*]] = icmp eq i64 [[I8]], 0
-; CHECK-NEXT: [[I10:%.*]] = and i1 [[I7]], [[I9]]
-; CHECK-NEXT: br i1 [[I10]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[I26:%.*]] = sdiv exact i64 [[IV]], 1
-; CHECK-NEXT: [[I29:%.*]] = getelementptr inbounds float, float* [[INPUT]], i64 [[I26]]
-; CHECK-NEXT: [[I30:%.*]] = load float, float* [[I29]], align 4, !invariant.load !0
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ]
-; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, float* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT: store float [[I34]], float* [[I35]], align 4
-; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: loop.exit:
-; CHECK-NEXT: ret void
-;
- float* %output) local_unnamed_addr #0 {
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP12]], i32 4, <4 x i1> [[TMP10]], <4 x float> poison), !invariant.load !0
entry:
br label %loop.header
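; A minimal before/after sketch of the flag dropping checked above (values here are
; illustrative, not matched by FileCheck). The guarded scalar division carries the
; flags:
;   %i26 = sdiv exact i64 %iv, 1
;   %i29 = getelementptr inbounds float, float* %input, i64 %i26
; After vectorization the division runs unconditionally for every lane of the vector
; iteration, so the form feeding the masked load is the conservative one, with
; 'exact' and 'inbounds' dropped:
;   %t = sdiv i64 %index, 1
;   %p = getelementptr float, float* %input, i64 %t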
; Preserve poison-generating flags from 'sdiv' and 'getelementptr' feeding a masked gather.
define void @preserve_vector_exact_no_addr(float* noalias nocapture readonly %input,
+ float* %output) local_unnamed_addr #0 {
; CHECK-LABEL: @preserve_vector_exact_no_addr(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = and <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = and <4 x i1> [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], <i64 2, i64 2, i64 2, i64 2>
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], <4 x i64> [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP6]], i32 4, <4 x i1> [[TMP7]], <4 x float> undef), !invariant.load !0
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP7]], <4 x float> [[WIDE_MASKED_GATHER]], <4 x float> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[PREDPHI]], <4 x float>* [[TMP10]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT: [[I7:%.*]] = icmp ne i64 [[IV]], 0
-; CHECK-NEXT: [[I8:%.*]] = and i64 [[IV]], 1
-; CHECK-NEXT: [[I9:%.*]] = icmp eq i64 [[I8]], 0
-; CHECK-NEXT: [[I10:%.*]] = and i1 [[I7]], [[I9]]
-; CHECK-NEXT: br i1 [[I10]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[I26:%.*]] = sdiv exact i64 [[IV]], 2
-; CHECK-NEXT: [[I29:%.*]] = getelementptr inbounds float, float* [[INPUT]], i64 [[I26]]
-; CHECK-NEXT: [[I30:%.*]] = load float, float* [[I29]], align 4, !invariant.load !0
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[I34:%.*]] = phi float [ 0.000000e+00, [[LOOP_HEADER]] ], [ [[I30]], [[IF_THEN]] ]
-; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds float, float* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT: store float [[I34]], float* [[I35]], align 4
-; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: loop.exit:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP4:%.*]] = icmp ne <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = and <4 x i1> [[TMP4]], [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], <i64 2, i64 2, i64 2, i64 2>
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[INPUT:%.*]], <4 x i64> [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor <4 x i1> [[TMP7]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> [[TMP9]], i32 4, <4 x i1> [[TMP10]], <4 x float> undef), !invariant.load !0
-;
- float* %output) local_unnamed_addr #0 {
entry:
br label %loop.header
; Preserve poison-generating flags on 'sdiv' (exact) when its result does not feed the address
; of any masked load/store/gather/scatter.
define void @preserve_exact_no_addr(i64* %output) local_unnamed_addr #0 {
; CHECK-LABEL: @preserve_exact_no_addr(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, {{.*}} ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, {{.*}} ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], <i64 2, i64 2, i64 2, i64 2>
-; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP1]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x i64> [[TMP2]], <4 x i64> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[OUTPUT:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[TMP5]] to <4 x i64>*
-; CHECK-NEXT: store <4 x i64> [[PREDPHI]], <4 x i64>* [[TMP6]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INC:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT: [[I23:%.*]] = icmp eq i64 [[IV]], 0
-; CHECK-NEXT: br i1 [[I23]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[I27:%.*]] = sdiv exact i64 [[IV]], 2
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[I34:%.*]] = phi i64 [ 0, [[LOOP_HEADER]] ], [ [[I27]], [[IF_THEN]] ]
-; CHECK-NEXT: [[I35:%.*]] = getelementptr inbounds i64, i64* [[OUTPUT]], i64 [[IV]]
-; CHECK-NEXT: store i64 [[I34]], i64* [[I35]], align 4
-; CHECK-NEXT: [[IV_INC]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_INC]], 4
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK: loop.exit:
-; CHECK-NEXT: ret void
-;
+; CHECK: [[TMP4:%.*]] = icmp eq <4 x i64> [[VEC_IND]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = sdiv exact <4 x i64> [[VEC_IND]], <i64 2, i64 2, i64 2, i64 2>
+; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[TMP5]], <4 x i64> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[OUTPUT:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[TMP7]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64* [[TMP8]] to <4 x i64>*
+; CHECK-NEXT: store <4 x i64> [[PREDPHI]], <4 x i64>* [[TMP9]], align 4
entry:
br label %loop.header
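; In contrast to the drop_* functions, the preserve_* checks above keep the flags on
; the widened instructions (e.g. 'sub nuw nsw <4 x i64>' and 'sdiv exact <4 x i64>'),
; because their results only feed a select and a plain store, never the address of a
; masked load/store/gather/scatter; any poison produced on masked-off lanes is
; discarded by the select.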
; CHECK: Found an estimated cost of 1 for VF 2 For instruction: %neg = fneg float %{{.*}}
; CHECK: Found an estimated cost of 1 for VF 4 For instruction: %neg = fneg float %{{.*}}
define void @fneg_cost(float* %a, i64 %n) {
-; CHECK-LABEL: @fneg_cost(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = fneg <4 x float> [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP9:%.*]] = fneg <4 x float> [[WIDE_LOAD1]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[NEG:%.*]] = fneg float [[TMP13]]
-; CHECK-NEXT: store float [[NEG]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
; REQUIRES: asserts
; CHECK: cost of 1 for VF 1 For instruction: %conv = fptosi float %tmp to i8
define void @float_to_sint8_cost(i8* noalias nocapture %a, float* noalias nocapture readonly %b) nounwind {
-; CHECK-LABEL: @float_to_sint8_cost(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 8
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, <8 x float>* [[TMP11]], align 4
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 16
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, <8 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 24
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, <8 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = fptosi <8 x float> [[WIDE_LOAD]] to <8 x i8>
-; CHECK-NEXT: [[TMP17:%.*]] = fptosi <8 x float> [[WIDE_LOAD1]] to <8 x i8>
-; CHECK-NEXT: [[TMP18:%.*]] = fptosi <8 x float> [[WIDE_LOAD2]] to <8 x i8>
-; CHECK-NEXT: [[TMP19:%.*]] = fptosi <8 x float> [[WIDE_LOAD3]] to <8 x i8>
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to <8 x i8>*
-; CHECK-NEXT: store <8 x i8> [[TMP16]], <8 x i8>* [[TMP25]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i32 8
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8* [[TMP26]] to <8 x i8>*
-; CHECK-NEXT: store <8 x i8> [[TMP17]], <8 x i8>* [[TMP27]], align 4
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i32 16
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast i8* [[TMP28]] to <8 x i8>*
-; CHECK-NEXT: store <8 x i8> [[TMP18]], <8 x i8>* [[TMP29]], align 4
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, i8* [[TMP20]], i32 24
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast i8* [[TMP30]] to <8 x i8>*
-; CHECK-NEXT: store <8 x i8> [[TMP19]], <8 x i8>* [[TMP31]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CONV:%.*]] = fptosi float [[TMP]] to i8
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i8 [[CONV]], i8* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body
for.body:
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
-; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
-; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
-; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
-; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
-; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
+; AVX512-NEXT: [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP19]], <16 x float*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
-; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; AVX512-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP6]], 0
+; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
-; AVX512-NEXT: [[TMP7:%.*]] = load float, float* [[B]], align 4
-; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP7]], 5.000000e-01
+; AVX512-NEXT: [[TMP23:%.*]] = load float, float* [[B]], align 4
+; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
-; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
-; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
-; AVX512-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
-; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
-; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], <16 x i64> [[VEC_IND]], i32 1
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
+; AVX512-NEXT: [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER1]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], <16 x i64> [[VEC_IND]], i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP19]], <16 x float*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
-; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; AVX512-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP6]], 0
+; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
-; AVX512-NEXT: [[TMP7:%.*]] = load float, float* [[B]], align 4
-; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP7]], 5.000000e-01
+; AVX512-NEXT: [[TMP23:%.*]] = load float, float* [[B]], align 4
+; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
; AVX512-NEXT: [[B6:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], i64 [[INDVARS_IV]], i32 1
; AVX512-NEXT: store float [[ADD]], float* [[B6]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
-; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
-; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
-; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
-; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
-; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
+; AVX512-NEXT: [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP19]], <16 x float addrspace(1)*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
-; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; AVX512-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP6]], 0
+; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV]], i32 1
-; AVX512-NEXT: [[TMP7:%.*]] = load float, float addrspace(1)* [[B]], align 4
-; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP7]], 5.000000e-01
+; AVX512-NEXT: [[TMP23:%.*]] = load float, float addrspace(1)* [[B]], align 4
+; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store float [[ADD]], float addrspace(1)* [[ARRAYIDX5]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
-; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
-; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
-; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
-; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
-; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
+; AVX512-NEXT: [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP19]], <16 x float*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
-; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; AVX512-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP6]], 0
+; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], i64 [[INDVARS_IV]], i32 1
-; AVX512-NEXT: [[TMP7:%.*]] = load float, float addrspace(1)* [[B]], align 4
-; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP7]], 5.000000e-01
+; AVX512-NEXT: [[TMP23:%.*]] = load float, float addrspace(1)* [[B]], align 4
+; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; AVX512: vector.body:
; AVX512-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AVX512-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112, i64 128, i64 144, i64 160, i64 176, i64 192, i64 208, i64 224, i64 240>, [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
-; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
-; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
-; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
-; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
-; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
-; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP16]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
+; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> [[VEC_IND]], i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP18]], i32 4, <16 x i1> [[TMP17]], <16 x float> undef)
+; AVX512-NEXT: [[TMP19:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER2]], <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> [[VEC_IND]]
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP19]], <16 x float addrspace(1)*> [[TMP20]], i32 4, <16 x i1> [[TMP17]])
; AVX512-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 16
; AVX512-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
-; AVX512-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; AVX512-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; AVX512-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; AVX512-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; AVX512: middle.block:
; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; AVX512: for.body:
; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 4096, [[MIDDLE_BLOCK]] ]
; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
-; AVX512-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP6]], 0
+; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP22]], 0
; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
; AVX512: if.then:
; AVX512-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], i64 [[INDVARS_IV]], i32 1
-; AVX512-NEXT: [[TMP7:%.*]] = load float, float* [[B]], align 4
-; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP7]], 5.000000e-01
+; AVX512-NEXT: [[TMP23:%.*]] = load float, float* [[B]], align 4
+; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], 5.000000e-01
; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], i64 [[INDVARS_IV]]
; AVX512-NEXT: store float [[ADD]], float addrspace(1)* [[ARRAYIDX5]], align 4
; AVX512-NEXT: br label [[FOR_INC]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP8]], [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[TMP10]], [[TMP8]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP10]], [[TMP8]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[TMP12]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[ADD_US]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
-; CHECK-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP19]], i32 0
-; CHECK-NEXT: store i32 [[TMP20]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[TMP19]], i32 1
-; CHECK-NEXT: store i32 [[TMP21]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x i32> [[TMP19]], i32 2
-; CHECK-NEXT: store i32 [[TMP22]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x i32> [[TMP19]], i32 3
-; CHECK-NEXT: store i32 [[TMP23]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[TMP16]], 0
+; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[ADD_US]], [[TMP17]]
+; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP19]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP22]], align 4
+; CHECK-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i32> [[TMP23]], i32 0
+; CHECK-NEXT: store i32 [[TMP24]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[TMP23]], i32 1
+; CHECK-NEXT: store i32 [[TMP25]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x i32> [[TMP23]], i32 2
+; CHECK-NEXT: store i32 [[TMP26]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3
+; CHECK-NEXT: store i32 [[TMP27]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]]
; AVX-NEXT: [[PREDPHI]] = select <4 x i1> [[TMP4]], <4 x double> [[TMP5]], <4 x double> [[VEC_PHI]]
; AVX-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; AVX-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; AVX-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; AVX: middle.block:
; AVX-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[PREDPHI]])
; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32
; AVX-NEXT: [[TOT_NEXT]] = phi double [ [[TOT]], [[NO_ADD]] ], [ [[TOT_NEW]], [[DO_ADD]] ]
; AVX-NEXT: [[I_NEXT]] = add i32 [[I]], 1
; AVX-NEXT: [[AGAIN:%.*]] = icmp ult i32 [[I_NEXT]], 32
-; AVX-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], !llvm.loop [[LOOP2:![0-9]+]]
+; AVX-NEXT: br i1 [[AGAIN]], label [[LOOP]], label [[DONE]], [[LOOP2:!llvm.loop !.*]]
; AVX: done:
; AVX-NEXT: [[TOT_NEXT_LCSSA:%.*]] = phi double [ [[TOT_NEXT]], [[NEXT_ITER]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
; AVX-NEXT: ret double [[TOT_NEXT_LCSSA]]
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -S | FileCheck %s
; This test checks that gather/scatter is not used for the i128 data type.
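; (Illustrative, hedged note, not part of the checked output: x86 gather and
; scatter hardware only handles 32- and 64-bit elements, so vectorizing the
; i128 stores in @main would require something along the lines of the
; hypothetical instantiation
;   call void @llvm.masked.scatter.v4i128.v4p0i128(<4 x i128> %vals, <4 x i128*> %ptrs, i32 16, <4 x i1> %mask)
; which the target cannot lower profitably; the loop is therefore expected to
; stay scalar.)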
@str = private unnamed_addr constant [45 x i8] c" PASS.....Y3 1/1 (BUBBLE SORT), X(25) = 5085\00"
; Function Attrs: noinline nounwind uwtable
-declare i32 @y3inner() #0
+declare i32 @y3inner() #0
define i32 @main() local_unnamed_addr #0 {
-; CHECK-LABEL: @main(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[DO_BODY:%.*]]
-; CHECK: do.body:
-; CHECK-NEXT: [[J_0:%.*]] = phi i128 [ 99999, [[ENTRY:%.*]] ], [ [[ADD10:%.*]], [[DO_BODY]] ]
-; CHECK-NEXT: [[I_0:%.*]] = phi i128 [ 1, [[ENTRY]] ], [ [[ADD11:%.*]], [[DO_BODY]] ]
-; CHECK-NEXT: [[AND:%.*]] = and i128 [[J_0]], 32767
-; CHECK-NEXT: [[IDXPROM:%.*]] = trunc i128 [[I_0]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 [[IDXPROM]]
-; CHECK-NEXT: store i128 [[AND]], i128* [[ARRAYIDX]], align 16
-; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i128 [[J_0]], 11111
-; CHECK-NEXT: [[AND1:%.*]] = and i128 [[ADD]], 32767
-; CHECK-NEXT: [[ADD2:%.*]] = add nuw nsw i128 [[I_0]], 1
-; CHECK-NEXT: [[IDXPROM3:%.*]] = trunc i128 [[ADD2]] to i64
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 [[IDXPROM3]]
-; CHECK-NEXT: store i128 [[AND1]], i128* [[ARRAYIDX4]], align 16
-; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i128 [[J_0]], 22222
-; CHECK-NEXT: [[AND6:%.*]] = and i128 [[ADD5]], 32767
-; CHECK-NEXT: [[ADD7:%.*]] = add nuw nsw i128 [[I_0]], 2
-; CHECK-NEXT: [[IDXPROM8:%.*]] = trunc i128 [[ADD7]] to i64
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [151 x i128], [151 x i128]* @x, i64 0, i64 [[IDXPROM8]]
-; CHECK-NEXT: store i128 [[AND6]], i128* [[ARRAYIDX9]], align 16
-; CHECK-NEXT: [[ADD10]] = add nuw nsw i128 [[J_0]], 33333
-; CHECK-NEXT: [[ADD11]] = add nuw nsw i128 [[I_0]], 3
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i128 [[ADD11]], 149
-; CHECK-NEXT: br i1 [[CMP]], label [[DO_BODY]], label [[DO_END:%.*]]
-; CHECK: do.end:
-; CHECK-NEXT: store i128 1766649, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 149), align 16
-; CHECK-NEXT: store i128 1766649, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 150), align 16
-; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @y3inner()
-; CHECK-NEXT: [[TMP0:%.*]] = load i128, i128* getelementptr inbounds ([151 x i128], [151 x i128]* @x, i64 0, i64 25), align 16
-; CHECK-NEXT: [[CMP12:%.*]] = icmp eq i128 [[TMP0]], 5085
-; CHECK-NEXT: br i1 [[CMP12]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[PUTS:%.*]] = tail call i32 @puts(i8* getelementptr inbounds ([45 x i8], [45 x i8]* @str, i64 0, i64 0))
-; CHECK-NEXT: br label [[IF_END:%.*]]
-; CHECK: if.else:
-; CHECK-NEXT: [[COERCE_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i128 [[TMP0]] to i64
-; CHECK-NEXT: [[COERCE_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i128 [[TMP0]], 64
-; CHECK-NEXT: [[COERCE_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i128 [[COERCE_SROA_2_0_EXTRACT_SHIFT]] to i64
-; CHECK-NEXT: [[CALL14:%.*]] = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([44 x i8], [44 x i8]* @.str.1, i64 0, i64 0), i64 [[COERCE_SROA_0_0_EXTRACT_TRUNC]], i64 [[COERCE_SROA_2_0_EXTRACT_TRUNC]])
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: ret i32 0
-;
entry:
br label %do.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mtriple x86_64 -S | FileCheck %s
%struct.ST4 = type { i32, i32, i32, i32 }
; Test from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=7560
define void @test1(%struct.ST4* noalias %B) {
-; CHECK-LABEL: @test1(
+; CHECK-LABEL: @test1
; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], %struct.ST4* [[B:%.*]], i64 [[INDVARS_IV]], i32 0
-; CHECK-NEXT: store i32 65536, i32* [[P1]], align 4
-; CHECK-NEXT: [[P2:%.*]] = getelementptr i32, i32* [[P1]], i32 -2147483648
-; CHECK-NEXT: store i32 65536, i32* [[P2]], align 4
-; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], i64 [[INDVARS_IV]], i32 2
-; CHECK-NEXT: store i32 10, i32* [[P3]], align 4
-; CHECK-NEXT: [[P4:%.*]] = getelementptr inbounds [[STRUCT_ST4]], %struct.ST4* [[B]], i64 [[INDVARS_IV]], i32 3
-; CHECK-NEXT: store i32 12, i32* [[P4]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-;
+; CHECK-NEXT: br label %for.body
+; CHECK-LABEL: for.body:
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: store i32
+; CHECK: store i32
+; CHECK-NOT: store
+;
entry:
br label %for.body
; Make sure that interleave groups whose key is the map's special 'empty'
; value do not cause a crash.
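; (Hedged note: interleave-group members are tracked in a DenseMap keyed by
; their index within the group, and DenseMap reserves two sentinel key
; values, 'empty' and 'tombstone'. The unusual GEP offsets in this test and
; in @test_tombstone_key below appear to be chosen so the computed index
; collides with those sentinels; the checks only require that the stores are
; still emitted and that vectorization does not crash.)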
define void @test_gap_empty_key() {
-; CHECK-LABEL: @test_gap_empty_key(
+; CHECK-LABEL: @test_gap_empty_key()
; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* undef, i64 0, i64 [[IV_NEXT]]
-; CHECK-NEXT: [[G2:%.*]] = getelementptr i32, i32* [[ARRAYIDX]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[G9:%.*]] = getelementptr i32, i32* [[G2]], i32 -2147483647
-; CHECK-NEXT: store i32 0, i32* [[G2]], align 4
-; CHECK-NEXT: store i32 1, i32* [[G9]], align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[IV]], 1000
-; CHECK-NEXT: br i1 false, label [[FOR_BODY]], label [[EXIT:%.*]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
+; CHECK-NEXT: br label %for.body
+; CHECK-LABEL: for.body:
+; CHECK: store i32
+; CHECK: store i32
+; CHECK-NOT: store
+;
entry:
br label %for.body
; Make sure that interleave groups whose key is the map's special 'tombstone'
; value do not cause a crash.
define void @test_tombstone_key() {
-; CHECK-LABEL: @test_tombstone_key(
+; CHECK-LABEL: @test_tombstone_key()
; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], 1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x i32], [3 x i32]* undef, i64 0, i64 [[IV_NEXT]]
-; CHECK-NEXT: [[G2:%.*]] = getelementptr i32, i32* [[ARRAYIDX]], i64 [[IV_NEXT]]
-; CHECK-NEXT: [[G9:%.*]] = getelementptr i32, i32* [[G2]], i32 -2147483648
-; CHECK-NEXT: store i32 0, i32* [[G2]], align 4
-; CHECK-NEXT: store i32 1, i32* [[G9]], align 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ule i64 [[IV]], 1000
-; CHECK-NEXT: br i1 false, label [[FOR_BODY]], label [[EXIT:%.*]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
+; CHECK-NEXT: br label %for.body
+; CHECK-LABEL: for.body:
+; CHECK: store i32
+; CHECK: store i32
+; CHECK-NOT: store
+;
entry:
br label %for.body
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[IND_END27:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT: [[IND_END24:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[N_VEC]]
-; CHECK-NEXT: [[CAST_CRD20:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[IND_END21:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD20]]
+; CHECK-NEXT: [[IND_END29:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[N_VEC]]
+; CHECK-NEXT: [[IND_END26:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT: [[CAST_CRD22:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[IND_END23:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD22]]
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 56
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK-NEXT: [[N_VEC19:%.*]] = and i64 [[TMP2]], 8589934584
; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC19]] to i32
; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
-; CHECK-NEXT: [[IND_END23:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[N_VEC19]]
-; CHECK-NEXT: [[IND_END26:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[N_VEC19]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT33:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT34:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT33]], <8 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[IND_END25:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[N_VEC19]]
+; CHECK-NEXT: [[IND_END28:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[N_VEC19]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <8 x i16> poison, i16 [[OFFSET]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT35]], <8 x i16> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX29:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT35:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP30:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[INDEX29]]
-; CHECK-NEXT: [[NEXT_GEP31:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX29]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[NEXT_GEP30]] to <8 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <8 x i16>, <8 x i16>* [[TMP22]], align 2
-; CHECK-NEXT: [[TMP23:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[WIDE_LOAD32]], <8 x i16> [[BROADCAST_SPLAT34]])
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i16* [[NEXT_GEP31]] to <8 x i16>*
-; CHECK-NEXT: store <8 x i16> [[TMP23]], <8 x i16>* [[TMP24]], align 2
-; CHECK-NEXT: [[INDEX_NEXT35]] = add nuw i64 [[INDEX29]], 8
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT35]], [[N_VEC19]]
-; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT21:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[NEXT_GEP32:%.*]] = getelementptr i16, i16* [[PSRC]], i64 [[INDEX20]]
+; CHECK-NEXT: [[NEXT_GEP33:%.*]] = getelementptr i16, i16* [[PDST]], i64 [[INDEX20]]
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast i16* [[NEXT_GEP32]] to <8 x i16>*
+; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <8 x i16>, <8 x i16>* [[TMP25]], align 2
+; CHECK-NEXT: [[TMP26:%.*]] = call <8 x i16> @llvm.uadd.sat.v8i16(<8 x i16> [[WIDE_LOAD34]], <8 x i16> [[BROADCAST_SPLAT36]])
+; CHECK-NEXT: [[TMP27:%.*]] = bitcast i16* [[NEXT_GEP33]] to <8 x i16>*
+; CHECK-NEXT: store <8 x i16> [[TMP26]], <8 x i16>* [[TMP27]], align 2
+; CHECK-NEXT: [[INDEX_NEXT21]] = add nuw i64 [[INDEX20]], 8
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT21]], [[N_VEC19]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[CMP_N28:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC19]]
-; CHECK-NEXT: br i1 [[CMP_N28]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N30:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC19]]
+; CHECK-NEXT: br i1 [[CMP_N30]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL22:%.*]] = phi i16* [ [[IND_END23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL25:%.*]] = phi i16* [ [[IND_END26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END27]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END23]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL24:%.*]] = phi i16* [ [[IND_END25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END26]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL27:%.*]] = phi i16* [ [[IND_END28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END29]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
; CHECK: while.body:
; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL22]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i16* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL25]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL24]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i16* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL27]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PSRC_ADDR_08]], i64 1
-; CHECK-NEXT: [[TMP26:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2
-; CHECK-NEXT: [[TMP27:%.*]] = tail call i16 @llvm.uadd.sat.i16(i16 [[TMP26]], i16 [[OFFSET]])
+; CHECK-NEXT: [[TMP29:%.*]] = load i16, i16* [[PSRC_ADDR_08]], align 2
+; CHECK-NEXT: [[TMP30:%.*]] = tail call i16 @llvm.uadd.sat.i16(i16 [[TMP29]], i16 [[OFFSET]])
; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i16, i16* [[PDST_ADDR_07]], i64 1
-; CHECK-NEXT: store i16 [[TMP27]], i16* [[PDST_ADDR_07]], align 2
+; CHECK-NEXT: store i16 [[TMP30]], i16* [[PDST_ADDR_07]], align 2
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[IND_END27:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
-; CHECK-NEXT: [[IND_END24:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
-; CHECK-NEXT: [[CAST_CRD20:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[IND_END21:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD20]]
+; CHECK-NEXT: [[IND_END29:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC]]
+; CHECK-NEXT: [[IND_END26:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC]]
+; CHECK-NEXT: [[CAST_CRD22:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[IND_END23:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD22]]
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP2]], 112
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK-NEXT: [[N_VEC19:%.*]] = and i64 [[TMP2]], 8589934576
; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC19]] to i32
; CHECK-NEXT: [[IND_END:%.*]] = sub i32 [[BLOCKSIZE]], [[CAST_CRD]]
-; CHECK-NEXT: [[IND_END23:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC19]]
-; CHECK-NEXT: [[IND_END26:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC19]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT33:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT34:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT33]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[IND_END25:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[N_VEC19]]
+; CHECK-NEXT: [[IND_END28:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[N_VEC19]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT35:%.*]] = insertelement <16 x i8> poison, i8 [[OFFSET]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT36:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT35]], <16 x i8> poison, <16 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX29:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT35:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[NEXT_GEP30:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX29]]
-; CHECK-NEXT: [[NEXT_GEP31:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX29]]
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i8* [[NEXT_GEP30]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <16 x i8>, <16 x i8>* [[TMP22]], align 2
-; CHECK-NEXT: [[TMP23:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[WIDE_LOAD32]], <16 x i8> [[WIDE_LOAD32]], <16 x i8> [[BROADCAST_SPLAT34]])
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[NEXT_GEP31]] to <16 x i8>*
-; CHECK-NEXT: store <16 x i8> [[TMP23]], <16 x i8>* [[TMP24]], align 2
-; CHECK-NEXT: [[INDEX_NEXT35]] = add nuw i64 [[INDEX29]], 16
-; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT35]], [[N_VEC19]]
-; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT21:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[NEXT_GEP32:%.*]] = getelementptr i8, i8* [[PSRC]], i64 [[INDEX20]]
+; CHECK-NEXT: [[NEXT_GEP33:%.*]] = getelementptr i8, i8* [[PDST]], i64 [[INDEX20]]
+; CHECK-NEXT: [[TMP25:%.*]] = bitcast i8* [[NEXT_GEP32]] to <16 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD34:%.*]] = load <16 x i8>, <16 x i8>* [[TMP25]], align 2
+; CHECK-NEXT: [[TMP26:%.*]] = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> [[WIDE_LOAD34]], <16 x i8> [[WIDE_LOAD34]], <16 x i8> [[BROADCAST_SPLAT36]])
+; CHECK-NEXT: [[TMP27:%.*]] = bitcast i8* [[NEXT_GEP33]] to <16 x i8>*
+; CHECK-NEXT: store <16 x i8> [[TMP26]], <16 x i8>* [[TMP27]], align 2
+; CHECK-NEXT: [[INDEX_NEXT21]] = add nuw i64 [[INDEX20]], 16
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT21]], [[N_VEC19]]
+; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[CMP_N28:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC19]]
-; CHECK-NEXT: br i1 [[CMP_N28]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N30:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC19]]
+; CHECK-NEXT: br i1 [[CMP_N30]], label [[WHILE_END]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END21]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL22:%.*]] = phi i8* [ [[IND_END23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END24]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL25:%.*]] = phi i8* [ [[IND_END26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END27]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END23]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[BLOCKSIZE]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL24:%.*]] = phi i8* [ [[IND_END25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END26]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PSRC]], [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL27:%.*]] = phi i8* [ [[IND_END28]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END29]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[PDST]], [[ITER_CHECK]] ]
; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
; CHECK: while.body:
; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL22]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL25]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL24]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi i8* [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL27]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[PSRC_ADDR_08]], i64 1
-; CHECK-NEXT: [[TMP26:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
-; CHECK-NEXT: [[TMP27:%.*]] = tail call i8 @llvm.fshl.i8(i8 [[TMP26]], i8 [[TMP26]], i8 [[OFFSET]])
+; CHECK-NEXT: [[TMP29:%.*]] = load i8, i8* [[PSRC_ADDR_08]], align 2
+; CHECK-NEXT: [[TMP30:%.*]] = tail call i8 @llvm.fshl.i8(i8 [[TMP29]], i8 [[TMP29]], i8 [[OFFSET]])
; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, i8* [[PDST_ADDR_07]], i64 1
-; CHECK-NEXT: store i8 [[TMP27]], i8* [[PDST_ADDR_07]], align 2
+; CHECK-NEXT: store i8 [[TMP30]], i8* [[PDST_ADDR_07]], align 2
; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1
; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[N_VEC12:%.*]] = and i64 [[SMAX6]], 9223372036854775800
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT16]], <8 x i32*> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT18]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[N_VEC13:%.*]] = and i64 [[SMAX6]], 9223372036854775800
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT17]], <8 x i32*> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT19]], <8 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT19]], <8 x i32>* [[TMP6]], align 4
-; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC12]]
+; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT20]], <8 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: [[INDEX_NEXT23]] = add nuw i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC13]]
; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT17]], zeroinitializer
-; CHECK-NEXT: [[WIDE_MASKED_GATHER20:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT17]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef)
-; CHECK-NEXT: [[PREDPHI21:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER20]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
-; CHECK-NEXT: [[CMP_N13:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC12]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI21]], i64 7
-; CHECK-NEXT: br i1 [[CMP_N13]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i32*> [[BROADCAST_SPLAT18]], zeroinitializer
+; CHECK-NEXT: [[WIDE_MASKED_GATHER21:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[BROADCAST_SPLAT18]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef)
+; CHECK-NEXT: [[PREDPHI22:%.*]] = select <8 x i1> [[TMP8]], <8 x i32> [[WIDE_MASKED_GATHER21]], <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 1>
+; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC13]]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x i32> [[PREDPHI22]], i64 7
+; CHECK-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC12]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ]
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT: [[N_VEC17:%.*]] = and i64 [[SMAX6]], 9223372036854775800
+; CHECK-NEXT: [[N_VEC18:%.*]] = and i64 [[SMAX6]], 9223372036854775800
; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> <i32 poison, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 [[BC_MERGE_RDX]], i64 0
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI20:%.*]] = phi <8 x i32> [ [[TMP14]], [[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI21:%.*]] = phi <8 x i32> [ [[TMP14]], [[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX20]]
; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 8
-; CHECK-NEXT: [[TMP17]] = add <8 x i32> [[VEC_PHI20]], [[WIDE_LOAD21]]
+; CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP16]], align 8
+; CHECK-NEXT: [[TMP17]] = add <8 x i32> [[VEC_PHI21]], [[WIDE_LOAD22]]
; CHECK-NEXT: store i32 [[NTRUNC]], i32* [[A]], align 4
-; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC17]]
+; CHECK-NEXT: [[INDEX_NEXT23]] = add nuw i64 [[INDEX20]], 8
+; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC18]]
; CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP17]])
-; CHECK-NEXT: [[CMP_N18:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC17]]
-; CHECK-NEXT: br i1 [[CMP_N18]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N19:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC18]]
+; CHECK-NEXT: br i1 [[CMP_N19]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC17]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX23:%.*]] = phi i32 [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX24:%.*]] = phi i32 [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; CHECK-NEXT: [[T0:%.*]] = phi i32 [ [[T3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX23]], [[VEC_EPILOG_SCALAR_PH]] ]
+; CHECK-NEXT: [[T0:%.*]] = phi i32 [ [[T3:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX24]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: [[T1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I]]
; CHECK-NEXT: [[T2:%.*]] = load i32, i32* [[T1]], align 8
; CHECK-NEXT: [[T3]] = add i32 [[T0]], [[T2]]
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[N_VEC13:%.*]] = and i64 [[SMAX6]], 9223372036854775800
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT17:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT18:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT17]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT19:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT20:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT19]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT21:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT22:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT21]], <8 x i32*> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[N_VEC14:%.*]] = and i64 [[SMAX6]], 9223372036854775800
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT18:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT18]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT20:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT21:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT20]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT22:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT23:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT22]], <8 x i32*> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[INDEX16:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT24:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX16]]
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD16]], [[BROADCAST_SPLAT18]]
+; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD17]], [[BROADCAST_SPLAT19]]
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT20]], <8 x i32>* [[TMP8]], align 4
-; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[BROADCAST_SPLAT20]], <8 x i32*> [[BROADCAST_SPLAT22]], i32 4, <8 x i1> [[TMP7]])
-; CHECK-NEXT: [[INDEX_NEXT23]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[N_VEC13]]
+; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT21]], <8 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[BROADCAST_SPLAT21]], <8 x i32*> [[BROADCAST_SPLAT23]], i32 4, <8 x i1> [[TMP7]])
+; CHECK-NEXT: [[INDEX_NEXT24]] = add nuw i64 [[INDEX16]], 8
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC14]]
; CHECK-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC13]]
-; CHECK-NEXT: br i1 [[CMP_N14]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[SMAX6]], [[N_VEC14]]
+; CHECK-NEXT: br i1 [[CMP_N15]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC13]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC14]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK_NOT_NOT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK: vec.epilog.ph:
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[N_VEC23:%.*]] = and i64 [[SMAX16]], 9223372036854775800
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT27:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT28:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT27]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT29:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT30:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT29]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT32:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT33:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT32]], <8 x i32*> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[N_VEC24:%.*]] = and i64 [[SMAX16]], 9223372036854775800
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT28:%.*]] = insertelement <8 x i32> poison, i32 [[K]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT29:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT28]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT30:%.*]] = insertelement <8 x i32> poison, i32 [[NTRUNC]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT31:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT30]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT33:%.*]] = insertelement <8 x i32*> poison, i32* [[A]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT34:%.*]] = shufflevector <8 x i32*> [[BROADCAST_SPLATINSERT33]], <8 x i32*> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT34:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: [[INDEX26:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT35:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX26]]
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <8 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD26:%.*]] = load <8 x i32>, <8 x i32>* [[TMP8]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD26]], [[BROADCAST_SPLAT28]]
+; CHECK-NEXT: [[WIDE_LOAD27:%.*]] = load <8 x i32>, <8 x i32>* [[TMP8]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD27]], [[BROADCAST_SPLAT29]]
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP7]] to <8 x i32>*
-; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT30]], <8 x i32>* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[C]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT: store <8 x i32> [[BROADCAST_SPLAT31]], <8 x i32>* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[C]], i64 [[INDEX26]]
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD31:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP12]], i32 8, <8 x i1> [[TMP9]], <8 x i32> poison)
-; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[WIDE_MASKED_LOAD31]], <8 x i32*> [[BROADCAST_SPLAT33]], i32 4, <8 x i1> [[TMP9]])
-; CHECK-NEXT: [[INDEX_NEXT34]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT34]], [[N_VEC23]]
+; CHECK-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP12]], i32 8, <8 x i1> [[TMP9]], <8 x i32> poison)
+; CHECK-NEXT: call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> [[WIDE_MASKED_LOAD32]], <8 x i32*> [[BROADCAST_SPLAT34]], i32 4, <8 x i1> [[TMP9]])
+; CHECK-NEXT: [[INDEX_NEXT35]] = add nuw i64 [[INDEX26]], 8
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT35]], [[N_VEC24]]
; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[CMP_N24:%.*]] = icmp eq i64 [[SMAX16]], [[N_VEC23]]
-; CHECK-NEXT: br i1 [[CMP_N24]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N25:%.*]] = icmp eq i64 [[SMAX16]], [[N_VEC24]]
+; CHECK-NEXT: br i1 [[CMP_N25]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC23]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC24]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ]
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -vector-library=LIBMVEC-X86 -inject-tli-mappings -loop-vectorize -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
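; (Hedged note: -inject-tli-mappings attaches vector-function-abi-variant
; attributes that let the vectorizer replace scalar libm calls and the
; matching llvm.sin/llvm.cos intrinsics with glibc libmvec routines. In the
; mangled names checked below, e.g. @_ZGVbN2v_sin and @_ZGVdN8v_sinf, the
; letter after _ZGV encodes the ISA class ('b' = 128-bit SSE, 'd' = 256-bit
; AVX2), 'N' means unmasked, the digit is the lane count, and 'v' marks a
; vector argument.)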
define void @sin_f64(double* nocapture %varray) {
; CHECK-LABEL: @sin_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[VEC_IND]] to <2 x double>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[STEP_ADD]] to <2 x double>
-; CHECK-NEXT: [[TMP4:%.*]] = call <2 x double> @_ZGVbN2v_sin(<2 x double> [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_sin(<2 x double> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[TMP8]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[TMP10]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @sin(double [[CONV]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_sin(<2 x double> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @sin_f32(float* nocapture %varray) {
; CHECK-LABEL: @sin_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @_ZGVdN8v_sinf(<8 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @sinf(float [[CONV]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <8 x float> @_ZGVdN8v_sinf(<8 x float> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @sin_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @sin_f64_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[VEC_IND]] to <2 x double>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[STEP_ADD]] to <2 x double>
-; CHECK-NEXT: [[TMP4:%.*]] = call <2 x double> @_ZGVbN2v_sin(<2 x double> [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_sin(<2 x double> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[TMP8]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[TMP10]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[CONV]]) #[[ATTR4:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_sin(<2 x double> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @sin_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @sin_f32_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @_ZGVdN8v_sinf(<8 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[CONV]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <8 x float> @_ZGVdN8v_sinf(<8 x float> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @cos_f64(double* nocapture %varray) {
; CHECK-LABEL: @cos_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[VEC_IND]] to <2 x double>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[STEP_ADD]] to <2 x double>
-; CHECK-NEXT: [[TMP4:%.*]] = call <2 x double> @_ZGVbN2v_cos(<2 x double> [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_cos(<2 x double> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[TMP8]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[TMP10]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @cos(double [[CONV]]) #[[ATTR6:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_cos(<2 x double> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @cos_f32(float* nocapture %varray) {
; CHECK-LABEL: @cos_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @_ZGVdN8v_cosf(<8 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @cosf(float [[CONV]]) #[[ATTR7:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <8 x float> @_ZGVdN8v_cosf(<8 x float> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @cos_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @cos_f64_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[VEC_IND]] to <2 x double>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[STEP_ADD]] to <2 x double>
-; CHECK-NEXT: [[TMP4:%.*]] = call <2 x double> @_ZGVbN2v_cos(<2 x double> [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_cos(<2 x double> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[TMP8]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[TMP10]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[CONV]]) #[[ATTR8:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_cos(<2 x double> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @cos_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @cos_f32_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @_ZGVdN8v_cosf(<8 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[CONV]]) #[[ATTR9:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <8 x float> @_ZGVdN8v_cosf(<8 x float> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @exp_f32(float* nocapture %varray) {
-; CHECK-LABEL: @exp_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call fast <8 x float> @_ZGVdN8v_expf(<8 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @expf(float [[CONV]]) #[[ATTR10:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @exp_f32(
+; CHECK-LABEL: vector.body
+; CHECK: <8 x float> @_ZGVdN8v_expf
entry:
br label %for.body
!93 = !{!"llvm.loop.vectorize.enable", i1 true}
define void @exp_f32_intrin(float* nocapture %varray) {
-; CHECK-LABEL: @exp_f32_intrin(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call fast <8 x float> @_ZGVdN8v_expf(<8 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @llvm.exp.f32(float [[CONV]]) #[[ATTR11:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @exp_f32_intrin(
+; CHECK-LABEL: vector.body
+; CHECK: <8 x float> @_ZGVdN8v_expf
entry:
br label %for.body
define void @log_f32(float* nocapture %varray) {
-; CHECK-LABEL: @log_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call fast <8 x float> @_ZGVdN8v_logf(<8 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP2]], <8 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @logf(float [[CONV]]) #[[ATTR12:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @log_f32(
+; CHECK-LABEL: vector.body
+; CHECK: <8 x float> @_ZGVdN8v_logf
entry:
br label %for.body
!113 = !{!"llvm.loop.vectorize.enable", i1 true}
define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
-; CHECK-LABEL: @pow_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VARRAY1:%.*]] = bitcast float* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT: [[EXP3:%.*]] = bitcast float* [[EXP:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[VARRAY]], i64 1000
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[EXP]], i64 1000
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4, !alias.scope !24
-; CHECK-NEXT: [[TMP5:%.*]] = call fast <8 x float> @_ZGVdN8vv_powf(<8 x float> [[TMP1]], <8 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP5]], <8 x float>* [[TMP8]], align 4, !alias.scope !27, !noalias !24
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @powf(float [[CONV]], float [[TMP1]]) #[[ATTR13:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @pow_f32(
+; CHECK-LABEL: vector.body
+; CHECK: <8 x float> @_ZGVdN8vv_powf
entry:
br label %for.body
!123 = !{!"llvm.loop.vectorize.enable", i1 true}
define void @pow_f32_intrin(float* nocapture %varray, float* nocapture readonly %exp) {
-; CHECK-LABEL: @pow_f32_intrin(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VARRAY1:%.*]] = bitcast float* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT: [[EXP3:%.*]] = bitcast float* [[EXP:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[VARRAY]], i64 1000
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[EXP]], i64 1000
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <8 x i32> [[VEC_IND]] to <8 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4, !alias.scope !31
-; CHECK-NEXT: [[TMP5:%.*]] = call fast <8 x float> @_ZGVdN8vv_powf(<8 x float> [[TMP1]], <8 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <8 x float>*
-; CHECK-NEXT: store <8 x float> [[TMP5]], <8 x float>* [[TMP8]], align 4, !alias.scope !34, !noalias !31
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.pow.f32(float [[CONV]], float [[TMP1]]) #[[ATTR14:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @pow_f32_intrin(
+; CHECK-LABEL: vector.body
+; CHECK: <8 x float> @_ZGVdN8vv_powf
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -vector-library=LIBMVEC-X86 -inject-tli-mappings -loop-vectorize -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
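; Check that, with -vector-library=LIBMVEC-X86, the finite libm calls below
; (__expf_finite, __exp_finite, __logf_finite, __log_finite, __powf_finite,
; __pow_finite) are vectorized to their GNU libmvec counterparts such as
; @_ZGVbN4v___expf_finite and @_ZGVdN4vv___pow_finite.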
define void @exp_f32(float* nocapture %varray) {
-; CHECK-LABEL: @exp_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast <4 x float> @_ZGVbN4v___expf_finite(<4 x float> [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = call fast <4 x float> @_ZGVbN4v___expf_finite(<4 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @__expf_finite(float [[CONV]]) #[[ATTR0:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @exp_f32
+; CHECK-LABEL: vector.body
+; CHECK: <4 x float> @_ZGVbN4v___expf_finite
+; CHECK: ret
entry:
br label %for.body
!3 = !{!"llvm.loop.vectorize.enable", i1 true}
define void @exp_f64(double* nocapture %varray) {
-; CHECK-LABEL: @exp_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call fast <4 x double> @_ZGVdN4v___exp_finite(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call fast double @__exp_finite(double [[CONV]]) #[[ATTR1:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @exp_f64
+; CHECK-LABEL: vector.body
+; CHECK: <4 x double> @_ZGVdN4v___exp_finite
+; CHECK: ret
entry:
br label %for.body
!13 = !{!"llvm.loop.vectorize.enable", i1 true}
define void @log_f32(float* nocapture %varray) {
-; CHECK-LABEL: @log_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast <4 x float> @_ZGVbN4v___logf_finite(<4 x float> [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = call fast <4 x float> @_ZGVbN4v___logf_finite(<4 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @__logf_finite(float [[CONV]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @log_f32
+; CHECK-LABEL: vector.body
+; CHECK: <4 x float> @_ZGVbN4v___logf_finite
+; CHECK: ret
entry:
br label %for.body
!23 = !{!"llvm.loop.vectorize.enable", i1 true}
define void @log_f64(double* nocapture %varray) {
-; CHECK-LABEL: @log_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call fast <4 x double> @_ZGVdN4v___log_finite(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call fast double @__log_finite(double [[CONV]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @log_f64
+; CHECK-LABEL: vector.body
+; CHECK: <4 x double> @_ZGVdN4v___log_finite
+; CHECK: ret
entry:
br label %for.body
!33 = !{!"llvm.loop.vectorize.enable", i1 true}
define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
-; CHECK-LABEL: @pow_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VARRAY1:%.*]] = bitcast float* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT: [[EXP3:%.*]] = bitcast float* [[EXP:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[VARRAY]], i64 1000
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[EXP]], i64 1000
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4, !alias.scope !10
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4, !alias.scope !10
-; CHECK-NEXT: [[TMP10:%.*]] = call fast <4 x float> @_ZGVbN4vv___powf_finite(<4 x float> [[TMP2]], <4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP11:%.*]] = call fast <4 x float> @_ZGVbN4vv___powf_finite(<4 x float> [[TMP3]], <4 x float> [[WIDE_LOAD7]])
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP10]], <4 x float>* [[TMP15]], align 4, !alias.scope !13, !noalias !10
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 4
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP17]], align 4, !alias.scope !13, !noalias !10
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @__powf_finite(float [[CONV]], float [[TMP1]]) #[[ATTR4:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @pow_f32
+; CHECK-LABEL: vector.body
+; CHECK: <4 x float> @_ZGVbN4vv___powf_finite
+; CHECK: ret
entry:
br label %for.body
!43 = !{!"llvm.loop.vectorize.enable", i1 true}
define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
-; CHECK-LABEL: @pow_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VARRAY1:%.*]] = bitcast double* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT: [[EXP3:%.*]] = bitcast double* [[EXP:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[VARRAY]], i64 1000
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[EXP]], i64 1000
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP4]], align 4, !alias.scope !17
-; CHECK-NEXT: [[TMP5:%.*]] = call fast <4 x double> @_ZGVdN4vv___pow_finite(<4 x double> [[TMP1]], <4 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[TMP8]], align 4, !alias.scope !20, !noalias !17
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[EXP]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast double @__pow_finite(double [[CONV]], double [[TMP1]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store double [[TMP2]], double* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @pow_f64
+; CHECK-LABEL: vector.body
+; CHECK: <4 x double> @_ZGVdN4vv___pow_finite
+; CHECK: ret
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -vector-library=LIBMVEC-X86 -inject-tli-mappings -loop-vectorize -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
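; Check that, with -vector-library=LIBMVEC-X86, calls to sin/cos (and the
; matching llvm.sin/llvm.cos intrinsics) are vectorized to the libmvec
; variants such as @_ZGVdN4v_sin and @_ZGVbN4v_sinf.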
define void @sin_f64(double* nocapture %varray) {
; CHECK-LABEL: @sin_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @_ZGVdN4v_sin(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @sin(double [[CONV]]) #[[ATTR2:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <4 x double> @_ZGVdN4v_sin(<4 x double> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @sin_f32(float* nocapture %varray) {
; CHECK-LABEL: @sin_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @_ZGVbN4v_sinf(<4 x float> [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_sinf(<4 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @sinf(float [[CONV]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_sinf(<4 x float> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @sin_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @sin_f64_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @_ZGVdN4v_sin(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[CONV]]) #[[ATTR4:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <4 x double> @_ZGVdN4v_sin(<4 x double> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @sin_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @sin_f32_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @_ZGVbN4v_sinf(<4 x float> [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_sinf(<4 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[CONV]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_sinf(<4 x float> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @cos_f64(double* nocapture %varray) {
; CHECK-LABEL: @cos_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @_ZGVdN4v_cos(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @cos(double [[CONV]]) #[[ATTR6:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <4 x double> @_ZGVdN4v_cos(<4 x double> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @cos_f32(float* nocapture %varray) {
; CHECK-LABEL: @cos_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @_ZGVbN4v_cosf(<4 x float> [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_cosf(<4 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @cosf(float [[CONV]]) #[[ATTR7:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_cosf(<4 x float> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @cos_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @cos_f64_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @_ZGVdN4v_cos(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[CONV]]) #[[ATTR8:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <4 x double> @_ZGVdN4v_cos(<4 x double> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @cos_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @cos_f32_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @_ZGVbN4v_cosf(<4 x float> [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_cosf(<4 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[CONV]]) #[[ATTR9:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body
+; CHECK: [[TMP5:%.*]] = call <4 x float> @_ZGVbN4v_cosf(<4 x float> [[TMP4:%.*]])
;
entry:
br label %for.body
define void @exp_f32(float* nocapture %varray) {
-; CHECK-LABEL: @exp_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast <4 x float> @_ZGVbN4v_expf(<4 x float> [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = call fast <4 x float> @_ZGVbN4v_expf(<4 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @expf(float [[CONV]]) #[[ATTR10:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @exp_f32(
+; CHECK-LABEL: vector.body
+; CHECK: <4 x float> @_ZGVbN4v_expf
entry:
br label %for.body
!93 = !{!"llvm.loop.vectorize.enable", i1 true}
define void @exp_f32_intrin(float* nocapture %varray) {
-; CHECK-LABEL: @exp_f32_intrin(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast <4 x float> @_ZGVbN4v_expf(<4 x float> [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = call fast <4 x float> @_ZGVbN4v_expf(<4 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @llvm.exp.f32(float [[CONV]]) #[[ATTR11:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @exp_f32_intrin(
+; CHECK-LABEL: vector.body
+; CHECK: <4 x float> @_ZGVbN4v_expf
entry:
br label %for.body
define void @log_f32(float* nocapture %varray) {
-; CHECK-LABEL: @log_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = call fast <4 x float> @_ZGVbN4v_logf(<4 x float> [[TMP2]])
-; CHECK-NEXT: [[TMP5:%.*]] = call fast <4 x float> @_ZGVbN4v_logf(<4 x float> [[TMP3]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP11]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @logf(float [[CONV]]) #[[ATTR12:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @log_f32(
+; CHECK-LABEL: vector.body
+; CHECK: <4 x float> @_ZGVbN4v_logf
entry:
br label %for.body
!113 = !{!"llvm.loop.vectorize.enable", i1 true}
define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
-; CHECK-LABEL: @pow_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VARRAY1:%.*]] = bitcast float* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT: [[EXP3:%.*]] = bitcast float* [[EXP:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[VARRAY]], i64 1000
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[EXP]], i64 1000
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4, !alias.scope !24
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4, !alias.scope !24
-; CHECK-NEXT: [[TMP10:%.*]] = call fast <4 x float> @_ZGVbN4vv_powf(<4 x float> [[TMP2]], <4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP11:%.*]] = call fast <4 x float> @_ZGVbN4vv_powf(<4 x float> [[TMP3]], <4 x float> [[WIDE_LOAD7]])
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP10]], <4 x float>* [[TMP15]], align 4, !alias.scope !27, !noalias !24
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 4
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP17]], align 4, !alias.scope !27, !noalias !24
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @powf(float [[CONV]], float [[TMP1]]) #[[ATTR13:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @pow_f32(
+; CHECK-LABEL: vector.body
+; CHECK: <4 x float> @_ZGVbN4vv_powf
entry:
br label %for.body
!123 = !{!"llvm.loop.vectorize.enable", i1 true}
define void @pow_f32_intrin(float* nocapture %varray, float* nocapture readonly %exp) {
-; CHECK-LABEL: @pow_f32_intrin(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VARRAY1:%.*]] = bitcast float* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT: [[EXP3:%.*]] = bitcast float* [[EXP:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[VARRAY]], i64 1000
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[EXP]], i64 1000
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP3:%.*]] = sitofp <4 x i32> [[STEP_ADD]] to <4 x float>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4, !alias.scope !31
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4, !alias.scope !31
-; CHECK-NEXT: [[TMP10:%.*]] = call fast <4 x float> @_ZGVbN4vv_powf(<4 x float> [[TMP2]], <4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP11:%.*]] = call fast <4 x float> @_ZGVbN4vv_powf(<4 x float> [[TMP3]], <4 x float> [[WIDE_LOAD7]])
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP10]], <4 x float>* [[TMP15]], align 4, !alias.scope !34, !noalias !31
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 4
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP17]], align 4, !alias.scope !34, !noalias !31
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.pow.f32(float [[CONV]], float [[TMP1]]) #[[ATTR14:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @pow_f32_intrin(
+; CHECK-LABEL: vector.body
+; CHECK: <4 x float> @_ZGVbN4vv_powf
entry:
br label %for.body
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP9]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i8* [[TMP10]] to <8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 64
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP11]], align 64
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <8 x i8>*
-; CHECK-NEXT: store <8 x i8> [[WIDE_LOAD3]], <8 x i8>* [[TMP14]], align 64
-; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT4]], 16
+; CHECK-NEXT: store <8 x i8> [[WIDE_LOAD4]], <8 x i8>* [[TMP14]], align 64
+; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 16
; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 17, 16
-; CHECK-NEXT: br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i64 17, 16
+; CHECK-NEXT: br i1 [[CMP_N3]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: br label [[LOOP_MEMCPY_EXPANSION:%.*]]
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
-; AVX512-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512-NEXT: [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 0
+; AVX512-NEXT: [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; AVX512-NEXT: [[TMP49:%.*]] = add i64 [[INDEX19]], 0
; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP49]]
; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32* [[TMP50]], i32 0
; AVX512-NEXT: [[TMP52:%.*]] = bitcast i32* [[TMP51]] to <8 x i32>*
; AVX512-NEXT: [[TMP59:%.*]] = getelementptr i32, i32* [[TMP58]], i32 0
; AVX512-NEXT: [[TMP60:%.*]] = bitcast i32* [[TMP59]] to <8 x i32>*
; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP57]], <8 x i32>* [[TMP60]], i32 4, <8 x i1> [[TMP53]])
-; AVX512-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8
+; AVX512-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX19]], 8
; AVX512-NEXT: [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT22]], 10000
; AVX512-NEXT: br i1 [[TMP61]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
-; AVX512-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512-NEXT: [[TMP49:%.*]] = add i64 [[OFFSET_IDX]], 0
+; AVX512-NEXT: [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; AVX512-NEXT: [[TMP49:%.*]] = add i64 [[INDEX19]], 0
; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[TMP49]]
; AVX512-NEXT: [[TMP51:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TMP50]], i32 0
; AVX512-NEXT: [[TMP52:%.*]] = bitcast i32 addrspace(1)* [[TMP51]] to <8 x i32> addrspace(1)*
; AVX512-NEXT: [[TMP59:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP58]], i32 0
; AVX512-NEXT: [[TMP60:%.*]] = bitcast i32 addrspace(1)* [[TMP59]] to <8 x i32> addrspace(1)*
; AVX512-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP57]], <8 x i32> addrspace(1)* [[TMP60]], i32 4, <8 x i1> [[TMP53]])
-; AVX512-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8
+; AVX512-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX19]], 8
; AVX512-NEXT: [[TMP61:%.*]] = icmp eq i64 [[INDEX_NEXT22]], 10000
; AVX512-NEXT: br i1 [[TMP61]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; AVX512: vec.epilog.middle.block:
; AVX512-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 9984, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
; AVX512-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; AVX512: vec.epilog.vector.body:
-; AVX512-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512-NEXT: [[TMP53:%.*]] = add i64 [[OFFSET_IDX]], 0
+; AVX512-NEXT: [[INDEX19:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT22:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; AVX512-NEXT: [[TMP53:%.*]] = add i64 [[INDEX19]], 0
; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[TMP53]]
; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TMP54]], i32 0
; AVX512-NEXT: [[TMP56:%.*]] = bitcast i32* [[TMP55]] to <8 x i32>*
; AVX512-NEXT: [[TMP64:%.*]] = getelementptr float, float* [[TMP63]], i32 0
; AVX512-NEXT: [[TMP65:%.*]] = bitcast float* [[TMP64]] to <8 x float>*
; AVX512-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP62]], <8 x float>* [[TMP65]], i32 4, <8 x i1> [[TMP57]])
-; AVX512-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[OFFSET_IDX]], 8
+; AVX512-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX19]], 8
; AVX512-NEXT: [[TMP66:%.*]] = icmp eq i64 [[INDEX_NEXT22]], 10000
; AVX512-NEXT: br i1 [[TMP66]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP33:![0-9]+]]
; AVX512: vec.epilog.middle.block:
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt < %s -S -debug -loop-vectorize -mcpu=slm 2>&1 | FileCheck %s --check-prefix=SLM
target triple = "x86_64-unknown-linux-gnu"
define i8 @mul_i8(i8* %dataA, i8* %dataB, i32 %N) {
-; SLM-LABEL: @mul_i8(
-; SLM-NEXT: entry:
-; SLM-NEXT: [[CMP12:%.*]] = icmp eq i32 [[N:%.*]], 0
-; SLM-NEXT: br i1 [[CMP12]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; SLM: for.body.preheader:
-; SLM-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; SLM-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
-; SLM-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; SLM: vector.ph:
-; SLM-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
-; SLM-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; SLM-NEXT: br label [[VECTOR_BODY:%.*]]
-; SLM: vector.body:
-; SLM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SLM-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
-; SLM-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ]
-; SLM-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; SLM-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; SLM-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[DATAA:%.*]], i64 [[TMP0]]
-; SLM-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[DATAA]], i64 [[TMP1]]
-; SLM-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
-; SLM-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1
-; SLM-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 4
-; SLM-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
-; SLM-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; SLM-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
-; SLM-NEXT: [[TMP9:%.*]] = sext <4 x i8> [[WIDE_LOAD2]] to <4 x i32>
-; SLM-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[DATAB:%.*]], i64 [[TMP0]]
-; SLM-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[DATAB]], i64 [[TMP1]]
-; SLM-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 0
-; SLM-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <4 x i8>*
-; SLM-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP13]], align 1
-; SLM-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 4
-; SLM-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <4 x i8>*
-; SLM-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP15]], align 1
-; SLM-NEXT: [[TMP16:%.*]] = sext <4 x i8> [[WIDE_LOAD3]] to <4 x i32>
-; SLM-NEXT: [[TMP17:%.*]] = sext <4 x i8> [[WIDE_LOAD4]] to <4 x i32>
-; SLM-NEXT: [[TMP18:%.*]] = mul nsw <4 x i32> [[TMP16]], [[TMP8]]
-; SLM-NEXT: [[TMP19:%.*]] = mul nsw <4 x i32> [[TMP17]], [[TMP9]]
-; SLM-NEXT: [[TMP20:%.*]] = zext <4 x i8> [[WIDE_LOAD3]] to <4 x i32>
-; SLM-NEXT: [[TMP21:%.*]] = zext <4 x i8> [[WIDE_LOAD4]] to <4 x i32>
-; SLM-NEXT: [[TMP22:%.*]] = mul nsw <4 x i32> [[TMP20]], [[TMP8]]
-; SLM-NEXT: [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP21]], [[TMP9]]
-; SLM-NEXT: [[TMP24:%.*]] = add <4 x i32> [[TMP18]], [[TMP22]]
-; SLM-NEXT: [[TMP25:%.*]] = add <4 x i32> [[TMP19]], [[TMP23]]
-; SLM-NEXT: [[TMP26:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
-; SLM-NEXT: [[TMP27:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i32>
-; SLM-NEXT: [[TMP28:%.*]] = mul nsw <4 x i32> [[TMP26]], [[TMP20]]
-; SLM-NEXT: [[TMP29:%.*]] = mul nsw <4 x i32> [[TMP27]], [[TMP21]]
-; SLM-NEXT: [[TMP30:%.*]] = add <4 x i32> [[TMP24]], [[TMP28]]
-; SLM-NEXT: [[TMP31:%.*]] = add <4 x i32> [[TMP25]], [[TMP29]]
-; SLM-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> <i32 -120, i32 -120, i32 -120, i32 -120>, [[TMP16]]
-; SLM-NEXT: [[TMP33:%.*]] = mul nsw <4 x i32> <i32 -120, i32 -120, i32 -120, i32 -120>, [[TMP17]]
-; SLM-NEXT: [[TMP34:%.*]] = add <4 x i32> [[TMP30]], [[TMP32]]
-; SLM-NEXT: [[TMP35:%.*]] = add <4 x i32> [[TMP31]], [[TMP33]]
-; SLM-NEXT: [[TMP36:%.*]] = mul nsw <4 x i32> <i32 250, i32 250, i32 250, i32 250>, [[TMP16]]
-; SLM-NEXT: [[TMP37:%.*]] = mul nsw <4 x i32> <i32 250, i32 250, i32 250, i32 250>, [[TMP17]]
-; SLM-NEXT: [[TMP38:%.*]] = add <4 x i32> [[TMP34]], [[TMP36]]
-; SLM-NEXT: [[TMP39:%.*]] = add <4 x i32> [[TMP35]], [[TMP37]]
-; SLM-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> <i32 -120, i32 -120, i32 -120, i32 -120>, [[TMP20]]
-; SLM-NEXT: [[TMP41:%.*]] = mul nsw <4 x i32> <i32 -120, i32 -120, i32 -120, i32 -120>, [[TMP21]]
-; SLM-NEXT: [[TMP42:%.*]] = add <4 x i32> [[TMP38]], [[TMP40]]
-; SLM-NEXT: [[TMP43:%.*]] = add <4 x i32> [[TMP39]], [[TMP41]]
-; SLM-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> <i32 250, i32 250, i32 250, i32 250>, [[TMP20]]
-; SLM-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> <i32 250, i32 250, i32 250, i32 250>, [[TMP21]]
-; SLM-NEXT: [[TMP46:%.*]] = add <4 x i32> [[TMP42]], [[TMP44]]
-; SLM-NEXT: [[TMP47:%.*]] = add <4 x i32> [[TMP43]], [[TMP45]]
-; SLM-NEXT: [[TMP48:%.*]] = add <4 x i32> [[VEC_PHI]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT: [[TMP49:%.*]] = add <4 x i32> [[VEC_PHI1]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT: [[TMP50]] = add <4 x i32> [[TMP48]], [[TMP46]]
-; SLM-NEXT: [[TMP51]] = add <4 x i32> [[TMP49]], [[TMP47]]
-; SLM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; SLM-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SLM-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; SLM: middle.block:
-; SLM-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP51]], [[TMP50]]
-; SLM-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; SLM-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; SLM-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; SLM: scalar.ph:
-; SLM-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; SLM-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
-; SLM-NEXT: br label [[FOR_BODY:%.*]]
-; SLM: for.cond.cleanup.loopexit:
-; SLM-NEXT: [[ADD4_LCSSA:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
-; SLM-NEXT: [[PHITMP:%.*]] = trunc i32 [[ADD4_LCSSA]] to i8
-; SLM-NEXT: br label [[FOR_COND_CLEANUP]]
-; SLM: for.cond.cleanup:
-; SLM-NEXT: [[ACC_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[PHITMP]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; SLM-NEXT: ret i8 [[ACC_0_LCSSA]]
-; SLM: for.body:
-; SLM-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; SLM-NEXT: [[ACC_013:%.*]] = phi i32 [ [[ADD4]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; SLM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[DATAA]], i64 [[INDVARS_IV]]
-; SLM-NEXT: [[TMP54:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; SLM-NEXT: [[CONV:%.*]] = sext i8 [[TMP54]] to i32
-; SLM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[DATAB]], i64 [[INDVARS_IV]]
-; SLM-NEXT: [[TMP55:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; SLM-NEXT: [[CONV3:%.*]] = sext i8 [[TMP55]] to i32
-; SLM-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV3]], [[CONV]]
-; SLM-NEXT: [[CONV4:%.*]] = zext i8 [[TMP55]] to i32
-; SLM-NEXT: [[MUL2:%.*]] = mul nsw i32 [[CONV4]], [[CONV]]
-; SLM-NEXT: [[SUM0:%.*]] = add i32 [[MUL]], [[MUL2]]
-; SLM-NEXT: [[CONV5:%.*]] = zext i8 [[TMP54]] to i32
-; SLM-NEXT: [[MUL3:%.*]] = mul nsw i32 [[CONV5]], [[CONV4]]
-; SLM-NEXT: [[SUM1:%.*]] = add i32 [[SUM0]], [[MUL3]]
-; SLM-NEXT: [[MUL4:%.*]] = mul nsw i32 -120, [[CONV3]]
-; SLM-NEXT: [[SUM2:%.*]] = add i32 [[SUM1]], [[MUL4]]
-; SLM-NEXT: [[MUL5:%.*]] = mul nsw i32 250, [[CONV3]]
-; SLM-NEXT: [[SUM3:%.*]] = add i32 [[SUM2]], [[MUL5]]
-; SLM-NEXT: [[MUL6:%.*]] = mul nsw i32 -120, [[CONV4]]
-; SLM-NEXT: [[SUM4:%.*]] = add i32 [[SUM3]], [[MUL6]]
-; SLM-NEXT: [[MUL7:%.*]] = mul nsw i32 250, [[CONV4]]
-; SLM-NEXT: [[SUM5:%.*]] = add i32 [[SUM4]], [[MUL7]]
-; SLM-NEXT: [[ADD:%.*]] = add i32 [[ACC_013]], 5
-; SLM-NEXT: [[ADD4]] = add i32 [[ADD]], [[SUM5]]
-; SLM-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; SLM-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; SLM-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
entry:
%cmp12 = icmp eq i32 %N, 0
br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
%arrayidx2 = getelementptr inbounds i8, i8* %dataB, i64 %indvars.iv
%1 = load i8, i8* %arrayidx2, align 1
%conv3 = sext i8 %1 to i32
-; sources of the mul is sext\sext from i8
-; use pmullw\sext seq.
+; sources of the mul are sext\sext from i8
+; use pmullw\sext seq.
+; SLM: cost of 3 for VF 2 {{.*}} mul nsw i32 %conv3, %conv
%mul = mul nsw i32 %conv3, %conv
; sources of the mul is zext\sext from i8
; use pmulhw\pmullw\pshuf
+; SLM: cost of 2 for VF 2 {{.*}} mul nsw i32 %conv4, %conv
%conv4 = zext i8 %1 to i32
%mul2 = mul nsw i32 %conv4, %conv
%sum0 = add i32 %mul, %mul2
; sources of the mul is zext\zext from i8
; use pmullw\zext
+; SLM: cost of 2 for VF 2 {{.*}} mul nsw i32 %conv5, %conv4
%conv5 = zext i8 %0 to i32
%mul3 = mul nsw i32 %conv5, %conv4
%sum1 = add i32 %sum0, %mul3
; sources of the mul is sext\-120
; use pmullw\sext
+; SLM: cost of 3 for VF 2 {{.*}} mul nsw i32 -120, %conv3
%mul4 = mul nsw i32 -120, %conv3
%sum2 = add i32 %sum1, %mul4
; sources of the mul is sext\250
; use pmulhw\pmullw\pshuf
+; SLM: cost of 2 for VF 2 {{.*}} mul nsw i32 250, %conv3
%mul5 = mul nsw i32 250, %conv3
%sum3 = add i32 %sum2, %mul5
; sources of the mul is zext\-120
; use pmulhw\pmullw\pshuf
+; SLM: cost of 2 for VF 2 {{.*}} mul nsw i32 -120, %conv4
%mul6 = mul nsw i32 -120, %conv4
%sum4 = add i32 %sum3, %mul6
; sources of the mul is zext\250
; use pmullw\zext
+; SLM: cost of 2 for VF 2 {{.*}} mul nsw i32 250, %conv4
%mul7 = mul nsw i32 250, %conv4
%sum5 = add i32 %sum4, %mul7
%add = add i32 %acc.013, 5
}
define i16 @mul_i16(i16* %dataA, i16* %dataB, i32 %N) {
-; SLM-LABEL: @mul_i16(
-; SLM-NEXT: entry:
-; SLM-NEXT: [[CMP12:%.*]] = icmp eq i32 [[N:%.*]], 0
-; SLM-NEXT: br i1 [[CMP12]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; SLM: for.body.preheader:
-; SLM-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
-; SLM-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
-; SLM-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; SLM: vector.ph:
-; SLM-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
-; SLM-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
-; SLM-NEXT: br label [[VECTOR_BODY:%.*]]
-; SLM: vector.body:
-; SLM-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SLM-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
-; SLM-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ]
-; SLM-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; SLM-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; SLM-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[DATAA:%.*]], i64 [[TMP0]]
-; SLM-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[DATAA]], i64 [[TMP1]]
-; SLM-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP2]], i32 0
-; SLM-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; SLM-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 1
-; SLM-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP2]], i32 4
-; SLM-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
-; SLM-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 1
-; SLM-NEXT: [[TMP8:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
-; SLM-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[WIDE_LOAD2]] to <4 x i32>
-; SLM-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[DATAB:%.*]], i64 [[TMP0]]
-; SLM-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[DATAB]], i64 [[TMP1]]
-; SLM-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 0
-; SLM-NEXT: [[TMP13:%.*]] = bitcast i16* [[TMP12]] to <4 x i16>*
-; SLM-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i16>, <4 x i16>* [[TMP13]], align 1
-; SLM-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 4
-; SLM-NEXT: [[TMP15:%.*]] = bitcast i16* [[TMP14]] to <4 x i16>*
-; SLM-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i16>, <4 x i16>* [[TMP15]], align 1
-; SLM-NEXT: [[TMP16:%.*]] = sext <4 x i16> [[WIDE_LOAD3]] to <4 x i32>
-; SLM-NEXT: [[TMP17:%.*]] = sext <4 x i16> [[WIDE_LOAD4]] to <4 x i32>
-; SLM-NEXT: [[TMP18:%.*]] = mul nsw <4 x i32> [[TMP16]], [[TMP8]]
-; SLM-NEXT: [[TMP19:%.*]] = mul nsw <4 x i32> [[TMP17]], [[TMP9]]
-; SLM-NEXT: [[TMP20:%.*]] = zext <4 x i16> [[WIDE_LOAD3]] to <4 x i32>
-; SLM-NEXT: [[TMP21:%.*]] = zext <4 x i16> [[WIDE_LOAD4]] to <4 x i32>
-; SLM-NEXT: [[TMP22:%.*]] = mul nsw <4 x i32> [[TMP20]], [[TMP8]]
-; SLM-NEXT: [[TMP23:%.*]] = mul nsw <4 x i32> [[TMP21]], [[TMP9]]
-; SLM-NEXT: [[TMP24:%.*]] = add <4 x i32> [[TMP18]], [[TMP22]]
-; SLM-NEXT: [[TMP25:%.*]] = add <4 x i32> [[TMP19]], [[TMP23]]
-; SLM-NEXT: [[TMP26:%.*]] = zext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
-; SLM-NEXT: [[TMP27:%.*]] = zext <4 x i16> [[WIDE_LOAD2]] to <4 x i32>
-; SLM-NEXT: [[TMP28:%.*]] = mul nsw <4 x i32> [[TMP26]], [[TMP20]]
-; SLM-NEXT: [[TMP29:%.*]] = mul nsw <4 x i32> [[TMP27]], [[TMP21]]
-; SLM-NEXT: [[TMP30:%.*]] = add <4 x i32> [[TMP24]], [[TMP28]]
-; SLM-NEXT: [[TMP31:%.*]] = add <4 x i32> [[TMP25]], [[TMP29]]
-; SLM-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> <i32 -32000, i32 -32000, i32 -32000, i32 -32000>, [[TMP16]]
-; SLM-NEXT: [[TMP33:%.*]] = mul nsw <4 x i32> <i32 -32000, i32 -32000, i32 -32000, i32 -32000>, [[TMP17]]
-; SLM-NEXT: [[TMP34:%.*]] = add <4 x i32> [[TMP30]], [[TMP32]]
-; SLM-NEXT: [[TMP35:%.*]] = add <4 x i32> [[TMP31]], [[TMP33]]
-; SLM-NEXT: [[TMP36:%.*]] = mul nsw <4 x i32> <i32 64000, i32 64000, i32 64000, i32 64000>, [[TMP16]]
-; SLM-NEXT: [[TMP37:%.*]] = mul nsw <4 x i32> <i32 64000, i32 64000, i32 64000, i32 64000>, [[TMP17]]
-; SLM-NEXT: [[TMP38:%.*]] = add <4 x i32> [[TMP34]], [[TMP36]]
-; SLM-NEXT: [[TMP39:%.*]] = add <4 x i32> [[TMP35]], [[TMP37]]
-; SLM-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> <i32 -32000, i32 -32000, i32 -32000, i32 -32000>, [[TMP20]]
-; SLM-NEXT: [[TMP41:%.*]] = mul nsw <4 x i32> <i32 -32000, i32 -32000, i32 -32000, i32 -32000>, [[TMP21]]
-; SLM-NEXT: [[TMP42:%.*]] = add <4 x i32> [[TMP38]], [[TMP40]]
-; SLM-NEXT: [[TMP43:%.*]] = add <4 x i32> [[TMP39]], [[TMP41]]
-; SLM-NEXT: [[TMP44:%.*]] = mul nsw <4 x i32> <i32 250, i32 250, i32 250, i32 250>, [[TMP20]]
-; SLM-NEXT: [[TMP45:%.*]] = mul nsw <4 x i32> <i32 250, i32 250, i32 250, i32 250>, [[TMP21]]
-; SLM-NEXT: [[TMP46:%.*]] = add <4 x i32> [[TMP42]], [[TMP44]]
-; SLM-NEXT: [[TMP47:%.*]] = add <4 x i32> [[TMP43]], [[TMP45]]
-; SLM-NEXT: [[TMP48:%.*]] = add <4 x i32> [[VEC_PHI]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT: [[TMP49:%.*]] = add <4 x i32> [[VEC_PHI1]], <i32 5, i32 5, i32 5, i32 5>
-; SLM-NEXT: [[TMP50]] = add <4 x i32> [[TMP48]], [[TMP46]]
-; SLM-NEXT: [[TMP51]] = add <4 x i32> [[TMP49]], [[TMP47]]
-; SLM-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; SLM-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SLM-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; SLM: middle.block:
-; SLM-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP51]], [[TMP50]]
-; SLM-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; SLM-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
-; SLM-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; SLM: scalar.ph:
-; SLM-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; SLM-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
-; SLM-NEXT: br label [[FOR_BODY:%.*]]
-; SLM: for.cond.cleanup.loopexit:
-; SLM-NEXT: [[ADD4_LCSSA:%.*]] = phi i32 [ [[ADD4:%.*]], [[FOR_BODY]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
-; SLM-NEXT: [[PHITMP:%.*]] = trunc i32 [[ADD4_LCSSA]] to i16
-; SLM-NEXT: br label [[FOR_COND_CLEANUP]]
-; SLM: for.cond.cleanup:
-; SLM-NEXT: [[ACC_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[PHITMP]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; SLM-NEXT: ret i16 [[ACC_0_LCSSA]]
-; SLM: for.body:
-; SLM-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; SLM-NEXT: [[ACC_013:%.*]] = phi i32 [ [[ADD4]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; SLM-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[DATAA]], i64 [[INDVARS_IV]]
-; SLM-NEXT: [[TMP54:%.*]] = load i16, i16* [[ARRAYIDX]], align 1
-; SLM-NEXT: [[CONV:%.*]] = sext i16 [[TMP54]] to i32
-; SLM-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, i16* [[DATAB]], i64 [[INDVARS_IV]]
-; SLM-NEXT: [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX2]], align 1
-; SLM-NEXT: [[CONV3:%.*]] = sext i16 [[TMP55]] to i32
-; SLM-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV3]], [[CONV]]
-; SLM-NEXT: [[CONV4:%.*]] = zext i16 [[TMP55]] to i32
-; SLM-NEXT: [[MUL2:%.*]] = mul nsw i32 [[CONV4]], [[CONV]]
-; SLM-NEXT: [[SUM0:%.*]] = add i32 [[MUL]], [[MUL2]]
-; SLM-NEXT: [[CONV5:%.*]] = zext i16 [[TMP54]] to i32
-; SLM-NEXT: [[MUL3:%.*]] = mul nsw i32 [[CONV5]], [[CONV4]]
-; SLM-NEXT: [[SUM1:%.*]] = add i32 [[SUM0]], [[MUL3]]
-; SLM-NEXT: [[MUL4:%.*]] = mul nsw i32 -32000, [[CONV3]]
-; SLM-NEXT: [[SUM2:%.*]] = add i32 [[SUM1]], [[MUL4]]
-; SLM-NEXT: [[MUL5:%.*]] = mul nsw i32 64000, [[CONV3]]
-; SLM-NEXT: [[SUM3:%.*]] = add i32 [[SUM2]], [[MUL5]]
-; SLM-NEXT: [[MUL6:%.*]] = mul nsw i32 -32000, [[CONV4]]
-; SLM-NEXT: [[SUM4:%.*]] = add i32 [[SUM3]], [[MUL6]]
-; SLM-NEXT: [[MUL7:%.*]] = mul nsw i32 250, [[CONV4]]
-; SLM-NEXT: [[SUM5:%.*]] = add i32 [[SUM4]], [[MUL7]]
-; SLM-NEXT: [[ADD:%.*]] = add i32 [[ACC_013]], 5
-; SLM-NEXT: [[ADD4]] = add i32 [[ADD]], [[SUM5]]
-; SLM-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; SLM-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; SLM-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-;
entry:
%cmp12 = icmp eq i32 %N, 0
br i1 %cmp12, label %for.cond.cleanup, label %for.body.preheader
%arrayidx2 = getelementptr inbounds i16, i16* %dataB, i64 %indvars.iv
%1 = load i16, i16* %arrayidx2, align 1
%conv3 = sext i16 %1 to i32
-; sources of the mul is sext\sext from i16
-; use pmulhw\pmullw\pshuf seq.
+; sources of the mul are sext\sext from i16
+; use pmulhw\pmullw\pshuf seq.
+; SLM: cost of 3 for VF 4 {{.*}} mul nsw i32 %conv3, %conv
%mul = mul nsw i32 %conv3, %conv
; sources of the mul are zext\sext from i16
; use pmulld
+; SLM: cost of 2 for VF 4 {{.*}} mul nsw i32 %conv4, %conv
%conv4 = zext i16 %1 to i32
%mul2 = mul nsw i32 %conv4, %conv
%sum0 = add i32 %mul, %mul2
; sources of the mul are zext\zext from i16
; use pmulhw\pmullw\zext
+; SLM: cost of 2 for VF 4 {{.*}} mul nsw i32 %conv5, %conv4
%conv5 = zext i16 %0 to i32
%mul3 = mul nsw i32 %conv5, %conv4
%sum1 = add i32 %sum0, %mul3
; sources of the mul are sext\-32000
; use pmulhw\pmullw\sext
+; SLM: cost of 2 for VF 4 {{.*}} mul nsw i32 -32000, %conv3
%mul4 = mul nsw i32 -32000, %conv3
%sum2 = add i32 %sum1, %mul4
; sources of the mul are sext\64000
; use pmulld
+; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 64000, %conv3
%mul5 = mul nsw i32 64000, %conv3
%sum3 = add i32 %sum2, %mul5
; sources of the mul are zext\-32000
; use pmulld
+; SLM: cost of 11 for VF 4 {{.*}} mul nsw i32 -32000, %conv4
%mul6 = mul nsw i32 -32000, %conv4
%sum4 = add i32 %sum3, %mul6
; sources of the mul are zext\64000
; use pmulhw\pmullw\zext
+; SLM: cost of 5 for VF 4 {{.*}} mul nsw i32 250, %conv4
%mul7 = mul nsw i32 250, %conv4
%sum5 = add i32 %sum4, %mul7
%add = add i32 %acc.013, 5
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
; CHECK: remark: no_fpmath.c:6:11: loop not vectorized: cannot prove it is safe to reorder floating-point operations
; Function Attrs: nounwind readonly ssp uwtable
define double @cond_sum(i32* nocapture readonly %v, i32 %n) #0 !dbg !4 {
-; CHECK-LABEL: @cond_sum(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP_7:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG8:![0-9]+]]
-; CHECK-NEXT: br i1 [[CMP_7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG9:![0-9]+]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG10:![0-9]+]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi double [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]], !dbg [[DBG11:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[A_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; CHECK-NEXT: ret double [[A_0_LCSSA]], !dbg [[DBG11]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[A_08:%.*]] = phi double [ [[ADD]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG10]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !dbg [[DBG10]], !tbaa [[TBAA12:![0-9]+]]
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0, !dbg [[DBG16:![0-9]+]]
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP1]], double 3.400000e+00, double 1.150000e+00, !dbg [[DBG10]]
-; CHECK-NEXT: [[ADD]] = fadd double [[A_08]], [[COND]], !dbg [[DBG17:![0-9]+]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG9]]
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG9]]
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG9]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !dbg [[DBG9]]
-;
entry:
%cmp.7 = icmp sgt i32 %n, 0, !dbg !3
br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !8
; Function Attrs: nounwind readonly ssp uwtable
define double @cond_sum_loop_hint(i32* nocapture readonly %v, i32 %n) #0 !dbg !20 {
-; CHECK-LABEL: @cond_sum_loop_hint(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP_7:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG19:![0-9]+]]
-; CHECK-NEXT: br i1 [[CMP_7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG20:![0-9]+]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1, !dbg [[DBG21:![0-9]+]]
-; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64, !dbg [[DBG21]]
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1, !dbg [[DBG21]]
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4, !dbg [[DBG21]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !dbg [[DBG21]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4, !dbg [[DBG21]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]], !dbg [[DBG21]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]], !dbg [[DBG21]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], !dbg [[DBG20]]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ <double 0.000000e+00, double -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x double> [ <double -0.000000e+00, double -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP3]], !dbg [[DBG21]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[TMP4]], !dbg [[DBG21]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0, !dbg [[DBG21]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*, !dbg [[DBG21]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP8]], align 4, !dbg [[DBG21]], !tbaa [[TBAA12]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 2, !dbg [[DBG21]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*, !dbg [[DBG21]]
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP10]], align 4, !dbg [[DBG21]], !tbaa [[TBAA12]]
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], zeroinitializer, !dbg [[DBG22:![0-9]+]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD2]], zeroinitializer, !dbg [[DBG22]]
-; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP11]], <2 x double> <double 3.400000e+00, double 3.400000e+00>, <2 x double> <double 1.150000e+00, double 1.150000e+00>, !dbg [[DBG21]]
-; CHECK-NEXT: [[TMP14:%.*]] = select <2 x i1> [[TMP12]], <2 x double> <double 3.400000e+00, double 3.400000e+00>, <2 x double> <double 1.150000e+00, double 1.150000e+00>, !dbg [[DBG21]]
-; CHECK-NEXT: [[TMP15]] = fadd <2 x double> [[VEC_PHI]], [[TMP13]], !dbg [[DBG23:![0-9]+]]
-; CHECK-NEXT: [[TMP16]] = fadd <2 x double> [[VEC_PHI1]], [[TMP14]], !dbg [[DBG23]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG20]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG20]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg [[DBG20]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd <2 x double> [[TMP16]], [[TMP15]], !dbg [[DBG20]]
-; CHECK-NEXT: [[TMP18:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[BIN_RDX]]), !dbg [[DBG20]]
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]], !dbg [[DBG20]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]], !dbg [[DBG20]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG21]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi double [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]], !dbg [[DBG26:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[A_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT: ret double [[A_0_LCSSA]], !dbg [[DBG26]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[A_08:%.*]] = phi double [ [[ADD]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[INDVARS_IV]], !dbg [[DBG21]]
-; CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !dbg [[DBG21]], !tbaa [[TBAA12]]
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP19]], 0, !dbg [[DBG22]]
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP1]], double 3.400000e+00, double 1.150000e+00, !dbg [[DBG21]]
-; CHECK-NEXT: [[ADD]] = fadd double [[A_08]], [[COND]], !dbg [[DBG23]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG20]]
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG20]]
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG20]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !dbg [[DBG20]], !llvm.loop [[LOOP27:![0-9]+]]
-;
entry:
%cmp.7 = icmp sgt i32 %n, 0, !dbg !19
br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !21
!26 = distinct !{!26, !27}
!27 = !{!"llvm.loop.vectorize.enable", i1 true}
!28 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang",
- file: !5,
- isOptimized: true, flags: "-O2",
- splitDebugFilename: "abc.debug", emissionKind: 2)
+ file: !5,
+ isOptimized: true, flags: "-O2",
+ splitDebugFilename: "abc.debug", emissionKind: 2)
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness 2>&1 | FileCheck %s
; RUN: opt < %s -passes=loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks=loop-vectorize -pass-remarks-missed=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness 2>&1 | FileCheck %s
; Function Attrs: nounwind readonly ssp uwtable
define double @cond_sum(i32* nocapture readonly %v, i32 %n) #0 !dbg !4 !prof !29 {
-; CHECK-LABEL: @cond_sum(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP_7:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG9:![0-9]+]]
-; CHECK-NEXT: br i1 [[CMP_7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG10:![0-9]+]], !prof [[PROF11:![0-9]+]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG12:![0-9]+]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi double [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]], !dbg [[DBG13:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[A_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; CHECK-NEXT: ret double [[A_0_LCSSA]], !dbg [[DBG13]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[A_08:%.*]] = phi double [ [[ADD]], [[FOR_BODY]] ], [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG12]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !dbg [[DBG12]], !tbaa [[TBAA14:![0-9]+]]
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 0, !dbg [[DBG18:![0-9]+]]
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP1]], double 3.400000e+00, double 1.150000e+00, !dbg [[DBG12]]
-; CHECK-NEXT: [[ADD]] = fadd double [[A_08]], [[COND]], !dbg [[DBG19:![0-9]+]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG10]]
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG10]]
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG10]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !dbg [[DBG10]], !prof [[PROF20:![0-9]+]], !llvm.loop [[LOOP21:![0-9]+]]
-;
entry:
%cmp.7 = icmp sgt i32 %n, 0, !dbg !3
br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !8, !prof !30
; Function Attrs: nounwind readonly ssp uwtable
define double @cond_sum_loop_hint(i32* nocapture readonly %v, i32 %n) #0 !dbg !20 !prof !29 {
-; CHECK-LABEL: @cond_sum_loop_hint(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP_7:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG24:![0-9]+]]
-; CHECK-NEXT: br i1 [[CMP_7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG25:![0-9]+]], !prof [[PROF11]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1, !dbg [[DBG26:![0-9]+]]
-; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64, !dbg [[DBG26]]
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1, !dbg [[DBG26]]
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2, !dbg [[DBG26]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !dbg [[DBG26]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2, !dbg [[DBG26]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]], !dbg [[DBG26]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]], !dbg [[DBG26]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], !dbg [[DBG25]]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ <double 0.000000e+00, double -0.000000e+00>, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[V:%.*]], i64 [[TMP3]], !dbg [[DBG26]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0, !dbg [[DBG26]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*, !dbg [[DBG26]]
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4, !dbg [[DBG26]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <2 x i32> [[WIDE_LOAD]], zeroinitializer, !dbg [[DBG27:![0-9]+]]
-; CHECK-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x double> <double 3.400000e+00, double 3.400000e+00>, <2 x double> <double 1.150000e+00, double 1.150000e+00>, !dbg [[DBG26]]
-; CHECK-NEXT: [[TMP9]] = fadd <2 x double> [[VEC_PHI]], [[TMP8]], !dbg [[DBG28:![0-9]+]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2, !dbg [[DBG25]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]], !dbg [[DBG25]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg [[DBG25]], !prof [[PROF29:![0-9]+]], !llvm.loop [[LOOP30:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP11:%.*]] = call double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[TMP9]]), !dbg [[DBG25]]
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]], !dbg [[DBG25]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]], !dbg [[DBG25]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[FOR_BODY_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG26]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi double [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]], !dbg [[DBG32:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[A_0_LCSSA:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT: ret double [[A_0_LCSSA]], !dbg [[DBG32]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[A_08:%.*]] = phi double [ [[ADD]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[V]], i64 [[INDVARS_IV]], !dbg [[DBG26]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !dbg [[DBG26]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP12]], 0, !dbg [[DBG27]]
-; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP1]], double 3.400000e+00, double 1.150000e+00, !dbg [[DBG26]]
-; CHECK-NEXT: [[ADD]] = fadd double [[A_08]], [[COND]], !dbg [[DBG28]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG25]]
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG25]]
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG25]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !dbg [[DBG25]], !prof [[PROF33:![0-9]+]], !llvm.loop [[LOOP34:![0-9]+]]
-;
entry:
%cmp.7 = icmp sgt i32 %n, 0, !dbg !19
br i1 %cmp.7, label %for.body.preheader, label %for.cond.cleanup, !dbg !21, !prof !30
!26 = distinct !{!26, !27, !18}
!27 = !{!"llvm.loop.vectorize.enable", i1 true}
!28 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang",
- file: !5,
- isOptimized: true, flags: "-O2",
- splitDebugFilename: "abc.debug", emissionKind: 2)
+ file: !5,
+ isOptimized: true, flags: "-O2",
+ splitDebugFilename: "abc.debug", emissionKind: 2)
!29 = !{!"function_entry_count", i64 3}
!30 = !{!"branch_weights", i32 99, i32 1}
!31 = !{!"branch_weights", i32 1, i32 99}
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> poison, i32 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> poison, <64 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <64 x i32> [[VEC_IV]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
; CHECK: for.end:
; CHECK-NEXT: ret i32 0
;
; AUTOVF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i32> poison, i32 [[INDEX]], i32 0
; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> poison, <32 x i32> zeroinitializer
-; AUTOVF-NEXT: [[VEC_IV:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AUTOVF-NEXT: [[TMP1:%.*]] = icmp ule <32 x i32> [[VEC_IV]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; AUTOVF-NEXT: [[INDUCTION:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AUTOVF-NEXT: [[TMP1:%.*]] = icmp ule <32 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
; AUTOVF-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
; AUTOVF-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
; AUTOVF: for.end:
; AUTOVF-NEXT: ret i32 0
;
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> poison, i32 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> poison, <64 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <64 x i32> [[VEC_IV]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <64 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <64 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <64 x i8>*
; CHECK-NEXT: call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> [[TMP6]], <64 x i8>* [[TMP7]], i32 1, <64 x i1> [[TMP1]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 64
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]]
; CHECK: for.end:
; CHECK-NEXT: ret i32 0
;
; AUTOVF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; AUTOVF-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i32> poison, i32 [[INDEX]], i32 0
; AUTOVF-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i32> [[BROADCAST_SPLATINSERT]], <32 x i32> poison, <32 x i32> zeroinitializer
-; AUTOVF-NEXT: [[VEC_IV:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; AUTOVF-NEXT: [[TMP1:%.*]] = icmp ule <32 x i32> [[VEC_IV]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
+; AUTOVF-NEXT: [[INDUCTION:%.*]] = add <32 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AUTOVF-NEXT: [[TMP1:%.*]] = icmp ule <32 x i32> [[INDUCTION]], <i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202, i32 202>
; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
; AUTOVF-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <32 x i8>*
; AUTOVF-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
; AUTOVF-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], 202
-; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]]
; AUTOVF: for.end:
; AUTOVF-NEXT: ret i32 0
;
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <64 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <64 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <64 x i32> [[TMP1]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <64 x i32> @llvm.masked.gather.v64i32.v64p0i32(<64 x i32*> [[TMP2]], i32 4, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <64 x i32> undef)
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <64 x i32>*
-; CHECK-NEXT: store <64 x i32> [[WIDE_MASKED_GATHER]], <64 x i32>* [[TMP5]], align 4
+; CHECK-NEXT: [[TMP64:%.*]] = mul nsw <64 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <64 x i32> [[TMP64]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <64 x i32> @llvm.masked.gather.v64i32.v64p0i32(<64 x i32*> [[TMP65]], i32 4, <64 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <64 x i32> undef)
+; CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i32, i32* [[TMP66]], i32 0
+; CHECK-NEXT: [[TMP68:%.*]] = bitcast i32* [[TMP67]] to <64 x i32>*
+; CHECK-NEXT: store <64 x i32> [[WIDE_MASKED_GATHER]], <64 x i32>* [[TMP68]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 64
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <64 x i32> [[VEC_IND]], <i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64, i32 64>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP69:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP69]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 256, 256
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K]]
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[MUL]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP70:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_07]]
-; CHECK-NEXT: store i32 [[TMP7]], i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT: store i32 [[TMP70]], i32* [[ARRAYIDX1]], align 4
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: ret void
;
; AUTOVF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; AUTOVF-NEXT: [[VEC_IND:%.*]] = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; AUTOVF-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; AUTOVF-NEXT: [[TMP1:%.*]] = mul nsw <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; AUTOVF-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <8 x i32> [[TMP1]]
-; AUTOVF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
-; AUTOVF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; AUTOVF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; AUTOVF-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
-; AUTOVF-NEXT: store <8 x i32> [[WIDE_MASKED_GATHER]], <8 x i32>* [[TMP5]], align 4
+; AUTOVF-NEXT: [[TMP8:%.*]] = mul nsw <8 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; AUTOVF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <8 x i32> [[TMP8]]
+; AUTOVF-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+; AUTOVF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
+; AUTOVF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
+; AUTOVF-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>*
+; AUTOVF-NEXT: store <8 x i32> [[WIDE_MASKED_GATHER]], <8 x i32>* [[TMP12]], align 4
; AUTOVF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; AUTOVF-NEXT: [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; AUTOVF-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; AUTOVF-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; AUTOVF-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
+; AUTOVF-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
; AUTOVF: middle.block:
; AUTOVF-NEXT: [[CMP_N:%.*]] = icmp eq i32 256, 256
; AUTOVF-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AUTOVF-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; AUTOVF-NEXT: [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K]]
; AUTOVF-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[MUL]]
-; AUTOVF-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AUTOVF-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; AUTOVF-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_07]]
-; AUTOVF-NEXT: store i32 [[TMP7]], i32* [[ARRAYIDX1]], align 4
+; AUTOVF-NEXT: store i32 [[TMP14]], i32* [[ARRAYIDX1]], align 4
; AUTOVF-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1
; AUTOVF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256
-; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; AUTOVF-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]]
; AUTOVF: for.end.loopexit:
; AUTOVF-NEXT: ret void
;
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
;CHECK-NOT: <4 x i32>
define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable {
-; CHECK-LABEL: @parallel_loop(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[INDVARS_IV_NEXT_REG2MEM:%.*]] = alloca i64, align 8
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_RELOAD2:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[STOREMERGE]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[STOREMERGE]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[IDXPROM3:%.*]] = sext i32 [[TMP1]] to i64
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IDXPROM3]]
-; CHECK-NEXT: store i32 [[TMP0]], i32* [[ARRAYIDX4]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[INDVARS_IV_NEXT:%.*]] = add i64 [[STOREMERGE]], 1
-; CHECK-NEXT: store i64 [[INDVARS_IV_NEXT]], i64* [[INDVARS_IV_NEXT_REG2MEM]], align 8
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4, !llvm.access.group !0
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[ARRAYIDX2]], align 4, !llvm.access.group !0
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 512
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], !llvm.loop [[LOOP1:![0-9]+]]
-; CHECK: for.body.for.body_crit_edge:
-; CHECK-NEXT: [[INDVARS_IV_NEXT_RELOAD2]] = load i64, i64* [[INDVARS_IV_NEXT_REG2MEM]], align 8
-; CHECK-NEXT: br label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
entry:
%indvars.iv.next.reg2mem = alloca i64
%indvars.iv.reg2mem = alloca i64
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt -runtime-memory-check-threshold=9 -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S -debug %s 2>&1 | FileCheck %s
; TODO: should not be vectorized.
define void @test(double* nocapture %A, double* nocapture %B, double* nocapture %C, double* nocapture %D, double* nocapture %E) {
; CHECK-LABEL: @test(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A1:%.*]] = bitcast double* [[A:%.*]] to i8*
-; CHECK-NEXT: [[B3:%.*]] = bitcast double* [[B:%.*]] to i8*
-; CHECK-NEXT: [[E6:%.*]] = bitcast double* [[E:%.*]] to i8*
-; CHECK-NEXT: [[C9:%.*]] = bitcast double* [[C:%.*]] to i8*
-; CHECK-NEXT: [[D12:%.*]] = bitcast double* [[D:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A]], i64 16
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[B]], i64 16
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[E]], i64 16
-; CHECK-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
-; CHECK-NEXT: [[SCEVGEP10:%.*]] = getelementptr double, double* [[C]], i64 16
-; CHECK-NEXT: [[SCEVGEP1011:%.*]] = bitcast double* [[SCEVGEP10]] to i8*
-; CHECK-NEXT: [[SCEVGEP13:%.*]] = getelementptr double, double* [[D]], i64 16
-; CHECK-NEXT: [[SCEVGEP1314:%.*]] = bitcast double* [[SCEVGEP13]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: [[BOUND015:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP78]]
-; CHECK-NEXT: [[BOUND116:%.*]] = icmp ult i8* [[E6]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT17:%.*]] = and i1 [[BOUND015]], [[BOUND116]]
-; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT17]]
-; CHECK-NEXT: [[BOUND018:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP1011]]
-; CHECK-NEXT: [[BOUND119:%.*]] = icmp ult i8* [[C9]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT20:%.*]] = and i1 [[BOUND018]], [[BOUND119]]
-; CHECK-NEXT: [[CONFLICT_RDX21:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT20]]
-; CHECK-NEXT: [[BOUND022:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP1314]]
-; CHECK-NEXT: [[BOUND123:%.*]] = icmp ult i8* [[D12]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT24:%.*]] = and i1 [[BOUND022]], [[BOUND123]]
-; CHECK-NEXT: [[CONFLICT_RDX25:%.*]] = or i1 [[CONFLICT_RDX21]], [[FOUND_CONFLICT24]]
-; CHECK-NEXT: [[BOUND026:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP78]]
-; CHECK-NEXT: [[BOUND127:%.*]] = icmp ult i8* [[E6]], [[SCEVGEP45]]
-; CHECK-NEXT: [[FOUND_CONFLICT28:%.*]] = and i1 [[BOUND026]], [[BOUND127]]
-; CHECK-NEXT: [[CONFLICT_RDX29:%.*]] = or i1 [[CONFLICT_RDX25]], [[FOUND_CONFLICT28]]
-; CHECK-NEXT: [[BOUND030:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP1011]]
-; CHECK-NEXT: [[BOUND131:%.*]] = icmp ult i8* [[C9]], [[SCEVGEP45]]
-; CHECK-NEXT: [[FOUND_CONFLICT32:%.*]] = and i1 [[BOUND030]], [[BOUND131]]
-; CHECK-NEXT: [[CONFLICT_RDX33:%.*]] = or i1 [[CONFLICT_RDX29]], [[FOUND_CONFLICT32]]
-; CHECK-NEXT: [[BOUND034:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP1314]]
-; CHECK-NEXT: [[BOUND135:%.*]] = icmp ult i8* [[D12]], [[SCEVGEP45]]
-; CHECK-NEXT: [[FOUND_CONFLICT36:%.*]] = and i1 [[BOUND034]], [[BOUND135]]
-; CHECK-NEXT: [[CONFLICT_RDX37:%.*]] = or i1 [[CONFLICT_RDX33]], [[FOUND_CONFLICT36]]
-; CHECK-NEXT: [[BOUND038:%.*]] = icmp ult i8* [[E6]], [[SCEVGEP1011]]
-; CHECK-NEXT: [[BOUND139:%.*]] = icmp ult i8* [[C9]], [[SCEVGEP78]]
-; CHECK-NEXT: [[FOUND_CONFLICT40:%.*]] = and i1 [[BOUND038]], [[BOUND139]]
-; CHECK-NEXT: [[CONFLICT_RDX41:%.*]] = or i1 [[CONFLICT_RDX37]], [[FOUND_CONFLICT40]]
-; CHECK-NEXT: [[BOUND042:%.*]] = icmp ult i8* [[E6]], [[SCEVGEP1314]]
-; CHECK-NEXT: [[BOUND143:%.*]] = icmp ult i8* [[D12]], [[SCEVGEP78]]
-; CHECK-NEXT: [[FOUND_CONFLICT44:%.*]] = and i1 [[BOUND042]], [[BOUND143]]
-; CHECK-NEXT: [[CONFLICT_RDX45:%.*]] = or i1 [[CONFLICT_RDX41]], [[FOUND_CONFLICT44]]
-; CHECK-NEXT: br i1 [[CONFLICT_RDX45]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP3]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP2]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> zeroinitializer, <2 x double>* [[TMP4]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT: [[TMP5:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[WIDE_LOAD]], <2 x double> <double 2.000000e+00, double 2.000000e+00>)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD46:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 4, !alias.scope !8, !noalias !9
-; CHECK-NEXT: [[TMP9:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[WIDE_LOAD46]], <2 x double> [[TMP5]])
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> zeroinitializer, <2 x double>* [[TMP10]], align 4, !alias.scope !8, !noalias !9
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[C]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[TMP12]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD47:%.*]] = load <2 x double>, <2 x double>* [[TMP13]], align 4, !alias.scope !10
-; CHECK-NEXT: [[TMP14:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[TMP5]], <2 x double> [[WIDE_LOAD47]])
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[D]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[TMP15]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD48:%.*]] = load <2 x double>, <2 x double>* [[TMP17]], align 8, !alias.scope !11
-; CHECK-NEXT: [[TMP18:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[TMP14]], <2 x double> [[WIDE_LOAD48]])
-; CHECK-NEXT: [[TMP19:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[TMP18]], <2 x double> [[TMP14]])
-; CHECK-NEXT: [[TMP20:%.*]] = fmul <2 x double> <double 2.000000e+00, double 2.000000e+00>, [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = fmul <2 x double> [[TMP20]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-NEXT: [[TMP22:%.*]] = fmul <2 x double> [[TMP20]], [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[E]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP23]], i32 0
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP22]], <2 x double>* [[TMP25]], align 4, !alias.scope !12, !noalias !13
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[L_A:%.*]] = load double, double* [[GEP_A]], align 4
-; CHECK-NEXT: store double 0.000000e+00, double* [[GEP_A]], align 4
-; CHECK-NEXT: [[P_1:%.*]] = call double @llvm.pow.f64(double [[L_A]], double 2.000000e+00)
-; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[IV]]
-; CHECK-NEXT: [[L_B:%.*]] = load double, double* [[GEP_B]], align 4
-; CHECK-NEXT: [[P_2:%.*]] = call double @llvm.pow.f64(double [[L_B]], double [[P_1]])
-; CHECK-NEXT: store double 0.000000e+00, double* [[GEP_B]], align 4
-; CHECK-NEXT: [[GEP_C:%.*]] = getelementptr inbounds double, double* [[C]], i64 [[IV]]
-; CHECK-NEXT: [[L_C:%.*]] = load double, double* [[GEP_C]], align 4
-; CHECK-NEXT: [[P_3:%.*]] = call double @llvm.pow.f64(double [[P_1]], double [[L_C]])
-; CHECK-NEXT: [[GEP_D:%.*]] = getelementptr inbounds double, double* [[D]], i64 [[IV]]
-; CHECK-NEXT: [[L_D:%.*]] = load double, double* [[GEP_D]], align 8
-; CHECK-NEXT: [[P_4:%.*]] = call double @llvm.pow.f64(double [[P_3]], double [[L_D]])
-; CHECK-NEXT: [[P_5:%.*]] = call double @llvm.pow.f64(double [[P_4]], double [[P_3]])
-; CHECK-NEXT: [[MUL:%.*]] = fmul double 2.000000e+00, [[P_5]]
-; CHECK-NEXT: [[MUL_2:%.*]] = fmul double [[MUL]], 2.000000e+00
-; CHECK-NEXT: [[MUL_3:%.*]] = fmul double [[MUL]], [[MUL_2]]
-; CHECK-NEXT: [[GEP_E:%.*]] = getelementptr inbounds double, double* [[E]], i64 [[IV]]
-; CHECK-NEXT: store double [[MUL_3]], double* [[GEP_E]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 16
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: vector.memcheck
+; CHECK: vector.body
;
entry:
br label %for.body
; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 8
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 8, 8
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 8
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP3:!llvm.loop !.*]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[DOTPROMOTED]], [[INDEX]]
; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[TMP20]], 1
-; CHECK-NEXT: [[TMP23:%.*]] = add i32 [[TMP21]], 1
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP22]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[TMP23]]
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i32 0
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[TMP27]], align 4
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP24]], i32 4
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[TMP29]], align 4
+; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP20]], 1
+; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[TMP21]], 1
+; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP25]]
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[TMP26]]
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[TMP29]], i32 0
+; CHECK-NEXT: [[TMP32:%.*]] = bitcast i32* [[TMP31]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[TMP32]], align 4
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[TMP29]], i32 4
+; CHECK-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[TMP34]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP37:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP37]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP6]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
define void @foo(i64* %ptr, i32* %ptr.2) {
; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[PTR_21:%.*]] = bitcast i32* [[PTR_2:%.*]] to i8*
-; CHECK-NEXT: [[PTR3:%.*]] = bitcast i64* [[PTR:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[PTR_2]], i64 1
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i64, i64* [[PTR]], i64 80
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i64* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[PTR_21]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[PTR3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 2, i64 3, i64 4, i64 5>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 2, [[INDEX]]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[OFFSET_IDX]] to i32
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], 3
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[PTR_2]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[PTR_2]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT: store i32 [[TMP3]], i32* [[PTR_2]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT: store i32 [[TMP4]], i32* [[PTR_2]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[PTR]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i64* [[TMP7]] to <4 x i64>*
-; CHECK-NEXT: store <4 x i64> [[VEC_IND]], <4 x i64>* [[TMP8]], align 8, !alias.scope !3
+; CHECK-NEXT: [[TRUNC:%.+]] = trunc i64 [[OFFSET_IDX]] to i32
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TRUNC]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TRUNC]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TRUNC]], 2
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TRUNC]], 3
+; CHECK-NEXT: = add i64 [[INDEX]], 0
+; CHECK-NEXT: store i32 [[TMP7]], i32* %ptr.2, align 4
+; CHECK-NEXT: store i32 [[TMP8]], i32* %ptr.2, align 4
+; CHECK-NEXT: store i32 [[TMP9]], i32* %ptr.2, align 4
+; CHECK-NEXT: store i32 [[TMP10]], i32* %ptr.2, align 4
+; CHECK: store <4 x i64> [[VEC_IND]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 80
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 80, 80
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 80, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ 82, [[MIDDLE_BLOCK]] ], [ 2, [[ENTRY]] ], [ 2, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[CAN_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[CAN_IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[SCALAR_PH]] ], [ [[TMP13:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = and i64 [[TMP10]], 4294967295
-; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP10]] to i32
-; CHECK-NEXT: store i32 [[TMP12]], i32* [[PTR_2]], align 4
-; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i64, i64* [[PTR]], i64 [[CAN_IV]]
-; CHECK-NEXT: store i64 [[TMP10]], i64* [[GEP_PTR]], align 8
-; CHECK-NEXT: [[TMP13]] = add nuw nsw i64 [[TMP11]], 1
-; CHECK-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], 80
-; CHECK-NEXT: [[CAN_IV_NEXT]] = add nuw nsw i64 [[CAN_IV]], 1
-; CHECK-NEXT: br i1 [[TMP14]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 80
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
;
entry:
br label %loop
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -S < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
; Make sure that we can compile the test without crashing.
define void @barney() {
+
; CHECK-LABEL: @barney(
-; CHECK-NEXT: bb:
-; CHECK-NEXT: br label [[BB2:%.*]]
-; CHECK: bb2:
-; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i32 undef, 0
-; CHECK-NEXT: br i1 [[TMP4]], label [[BB2]], label [[BB5:%.*]]
-; CHECK: bb5:
-; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 undef, i64 1)
-; CHECK-NEXT: br label [[BB19:%.*]]
-; CHECK: bb18:
-; CHECK-NEXT: ret void
-; CHECK: bb19:
-; CHECK-NEXT: [[TMP22:%.*]] = phi i32 [ [[TMP65_LCSSA:%.*]], [[BB36:%.*]] ], [ undef, [[BB5]] ]
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 32
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 32
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
-; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[CAST_CRD]], 13
-; CHECK-NEXT: [[IND_END:%.*]] = add i32 [[TMP22]], [[TMP0]]
-; CHECK-NEXT: [[IND_END3:%.*]] = add i64 1, [[N_VEC]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[TMP22]], i32 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i32> [[DOTSPLAT]], <i32 0, i32 13, i32 26, i32 39, i32 52, i32 65, i32 78, i32 91, i32 104, i32 117, i32 130, i32 143, i32 156, i32 169, i32 182, i32 195>
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <16 x i32> [[VEC_IND]], <i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208>
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 1, [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 16
-; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[VEC_IND]], <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13>
-; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i32> [[STEP_ADD]], <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13>
-; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i32> [[STEP_ADD]], <i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208, i32 208>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
-; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[N_VEC]], 1
-; CHECK-NEXT: [[CAST_CMO:%.*]] = trunc i64 [[TMP8]] to i32
-; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[CAST_CMO]], 13
-; CHECK-NEXT: [[IND_ESCAPE:%.*]] = add i32 [[TMP22]], [[TMP9]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[BB46:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[TMP22]], [[BB19]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[IND_END3]], [[MIDDLE_BLOCK]] ], [ 1, [[BB19]] ]
-; CHECK-NEXT: br label [[BB50:%.*]]
-; CHECK: bb33:
-; CHECK-NEXT: [[TMP65_LCSSA]] = phi i32 [ [[TMP65:%.*]], [[BB62:%.*]] ]
-; CHECK-NEXT: br i1 true, label [[BB18:%.*]], label [[BB36]]
-; CHECK: bb36:
-; CHECK-NEXT: br label [[BB19]]
-; CHECK: bb46:
-; CHECK-NEXT: [[TMP52_LCSSA:%.*]] = phi i32 [ [[TMP52:%.*]], [[BB50]] ], [ [[IND_ESCAPE]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[TMP55_LCSSA:%.*]] = phi i32 [ [[TMP55:%.*]], [[BB50]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[TMP56_LCSSA:%.*]] = phi i64 [ [[TMP56:%.*]], [[BB50]] ], [ [[IND_END3]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br i1 true, label [[BB48:%.*]], label [[BB59:%.*]]
-; CHECK: bb48:
-; CHECK-NEXT: [[TMP52_LCSSA_LCSSA:%.*]] = phi i32 [ [[TMP52_LCSSA]], [[BB46]] ]
-; CHECK-NEXT: [[TMP49:%.*]] = add i32 [[TMP52_LCSSA_LCSSA]], 14
-; CHECK-NEXT: ret void
-; CHECK: bb50:
-; CHECK-NEXT: [[TMP52]] = phi i32 [ [[TMP55]], [[BB50]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP53:%.*]] = phi i64 [ [[TMP56]], [[BB50]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP54:%.*]] = add i32 [[TMP52]], 12
-; CHECK-NEXT: [[TMP55]] = add i32 [[TMP52]], 13
-; CHECK-NEXT: [[TMP56]] = add nuw nsw i64 [[TMP53]], 1
-; CHECK-NEXT: [[TMP58:%.*]] = icmp ult i64 [[TMP53]], undef
-; CHECK-NEXT: br i1 [[TMP58]], label [[BB50]], label [[BB46]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: bb59:
-; CHECK-NEXT: br label [[BB62]]
-; CHECK: bb62:
-; CHECK-NEXT: [[TMP63:%.*]] = phi i32 [ [[TMP65]], [[BB68:%.*]] ], [ [[TMP55_LCSSA]], [[BB59]] ]
-; CHECK-NEXT: [[TMP64:%.*]] = phi i64 [ [[TMP66:%.*]], [[BB68]] ], [ [[TMP56_LCSSA]], [[BB59]] ]
-; CHECK-NEXT: [[TMP65]] = add i32 [[TMP63]], 13
-; CHECK-NEXT: [[TMP66]] = add nuw nsw i64 [[TMP64]], 1
-; CHECK-NEXT: [[TMP67:%.*]] = icmp ult i64 [[TMP66]], 2
-; CHECK-NEXT: br i1 [[TMP67]], label [[BB68]], label [[BB33:%.*]]
-; CHECK: bb68:
-; CHECK-NEXT: br label [[BB62]]
-;
bb:
br label %bb2
}
define i32 @foo(i32 addrspace(1)* %p) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[OUTER:%.*]]
-; CHECK: outer:
-; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[OUTER_LATCH]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDVAR]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = add i32 1, [[N_VEC]]
-; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[N_VEC]], 2
-; CHECK-NEXT: [[IND_END2:%.*]] = add i32 6, [[TMP1]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 6, i32 8, i32 10, i32 12>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP2]] = or <4 x i32> [[VEC_PHI]], [[VEC_IND]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+
+; CHECK-LABEL: @foo(
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP2]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[OUTER_LATCH]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 1, [[OUTER]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ 6, [[OUTER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[OUTER]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[INNER:%.*]]
-; CHECK: inner:
-; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP7:%.*]], [[INNER]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[A:%.*]] = phi i32 [ [[TMP8:%.*]], [[INNER]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[B:%.*]] = phi i32 [ [[TMP6:%.*]], [[INNER]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP6]] = add i32 [[B]], 2
-; CHECK-NEXT: [[TMP7]] = or i32 [[TMP5]], [[B]]
-; CHECK-NEXT: [[TMP8]] = add nuw nsw i32 [[A]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[IV]], [[TMP9]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[INNER]], label [[OUTER_LATCH]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: outer_latch:
-; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi i32 [ [[TMP7]], [[INNER]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: store atomic i32 [[DOTLCSSA]], i32 addrspace(1)* [[P:%.*]] unordered, align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[IV]], 63
-; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
-; CHECK-NEXT: br i1 [[TMP11]], label [[EXIT:%.*]], label [[OUTER]]
-; CHECK: exit:
-; CHECK-NEXT: ret i32 0
-;
entry:
br label %outer
; CHECK-NEXT: iter.check:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ITER_CHECK:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <64 x i8> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <64 x i8> [ zeroinitializer, [[ITER_CHECK]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <64 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <64 x i8> [ zeroinitializer, [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [128 x i8], [128 x i8]* @bytes, i64 0, i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <64 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP1]], align 16
; CHECK-NEXT: [[TMP5]] = add <64 x i8> [[WIDE_LOAD2]], [[VEC_PHI1]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX]], 0
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = add <64 x i8> [[TMP5]], [[TMP4]]
; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> [[BIN_RDX]])
; SSE2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i16> [[WIDE_VEC2]], <8 x i16> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; SSE2-NEXT: [[TMP9:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
; SSE2-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP9]], [[TMP5]]
-; SSE2-NEXT: [[TMP11:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32>
-; SSE2-NEXT: [[TMP12:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
-; SSE2-NEXT: [[TMP13:%.*]] = mul nsw <4 x i32> [[TMP12]], [[TMP11]]
-; SSE2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], [[TMP10]]
-; SSE2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
-; SSE2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
-; SSE2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; SSE2-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP17]], align 4
+; SSE2-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[STRIDED_VEC1]] to <4 x i32>
+; SSE2-NEXT: [[TMP15:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
+; SSE2-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[TMP15]], [[TMP13]]
+; SSE2-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[TMP16]], [[TMP10]]
+; SSE2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
+; SSE2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
+; SSE2-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <4 x i32>*
+; SSE2-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* [[TMP20]], align 4
; SSE2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SSE2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SSE2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SSE2-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SSE2-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; SSE2: middle.block:
; SSE2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; SSE2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; SSE2-NEXT: br label [[FOR_BODY:%.*]]
; SSE2: for.body:
; SSE2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; SSE2-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; SSE2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP19]]
-; SSE2-NEXT: [[TMP20:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; SSE2-NEXT: [[CONV:%.*]] = sext i16 [[TMP20]] to i32
-; SSE2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP19]]
-; SSE2-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; SSE2-NEXT: [[CONV5:%.*]] = sext i16 [[TMP21]] to i32
+; SSE2-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; SSE2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP22]]
+; SSE2-NEXT: [[TMP23:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; SSE2-NEXT: [[CONV:%.*]] = sext i16 [[TMP23]] to i32
+; SSE2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP22]]
+; SSE2-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
+; SSE2-NEXT: [[CONV5:%.*]] = sext i16 [[TMP24]] to i32
; SSE2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
-; SSE2-NEXT: [[TMP22:%.*]] = or i64 [[TMP19]], 1
-; SSE2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP22]]
-; SSE2-NEXT: [[TMP23:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; SSE2-NEXT: [[CONV11:%.*]] = sext i16 [[TMP23]] to i32
-; SSE2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP22]]
-; SSE2-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
-; SSE2-NEXT: [[CONV16:%.*]] = sext i16 [[TMP24]] to i32
+; SSE2-NEXT: [[TMP25:%.*]] = or i64 [[TMP22]], 1
+; SSE2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP25]]
+; SSE2-NEXT: [[TMP26:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; SSE2-NEXT: [[CONV11:%.*]] = sext i16 [[TMP26]] to i32
+; SSE2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP25]]
+; SSE2-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
+; SSE2-NEXT: [[CONV16:%.*]] = sext i16 [[TMP27]] to i32
; SSE2-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
; SSE2-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
; SSE2-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]]
; SSE41-NEXT: [[TMP19:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
; SSE41-NEXT: [[TMP20:%.*]] = mul nsw <4 x i32> [[TMP18]], [[TMP10]]
; SSE41-NEXT: [[TMP21:%.*]] = mul nsw <4 x i32> [[TMP19]], [[TMP11]]
-; SSE41-NEXT: [[TMP22:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
-; SSE41-NEXT: [[TMP23:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
-; SSE41-NEXT: [[TMP24:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
-; SSE41-NEXT: [[TMP25:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
-; SSE41-NEXT: [[TMP26:%.*]] = mul nsw <4 x i32> [[TMP24]], [[TMP22]]
-; SSE41-NEXT: [[TMP27:%.*]] = mul nsw <4 x i32> [[TMP25]], [[TMP23]]
-; SSE41-NEXT: [[TMP28:%.*]] = add nsw <4 x i32> [[TMP26]], [[TMP20]]
-; SSE41-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[TMP27]], [[TMP21]]
-; SSE41-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
-; SSE41-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP1]]
-; SSE41-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TMP30]], i32 0
-; SSE41-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <4 x i32>*
-; SSE41-NEXT: store <4 x i32> [[TMP28]], <4 x i32>* [[TMP33]], align 4
-; SSE41-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP30]], i32 4
-; SSE41-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <4 x i32>*
-; SSE41-NEXT: store <4 x i32> [[TMP29]], <4 x i32>* [[TMP35]], align 4
+; SSE41-NEXT: [[TMP26:%.*]] = sext <4 x i16> [[STRIDED_VEC3]] to <4 x i32>
+; SSE41-NEXT: [[TMP27:%.*]] = sext <4 x i16> [[STRIDED_VEC4]] to <4 x i32>
+; SSE41-NEXT: [[TMP30:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
+; SSE41-NEXT: [[TMP31:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
+; SSE41-NEXT: [[TMP32:%.*]] = mul nsw <4 x i32> [[TMP30]], [[TMP26]]
+; SSE41-NEXT: [[TMP33:%.*]] = mul nsw <4 x i32> [[TMP31]], [[TMP27]]
+; SSE41-NEXT: [[TMP34:%.*]] = add nsw <4 x i32> [[TMP32]], [[TMP20]]
+; SSE41-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[TMP33]], [[TMP21]]
+; SSE41-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
+; SSE41-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP1]]
+; SSE41-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 0
+; SSE41-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <4 x i32>*
+; SSE41-NEXT: store <4 x i32> [[TMP34]], <4 x i32>* [[TMP39]], align 4
+; SSE41-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 4
+; SSE41-NEXT: [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <4 x i32>*
+; SSE41-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP41]], align 4
; SSE41-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; SSE41-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SSE41-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; SSE41-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; SSE41-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; SSE41: middle.block:
; SSE41-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; SSE41-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; SSE41-NEXT: br label [[FOR_BODY:%.*]]
; SSE41: for.body:
; SSE41-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; SSE41-NEXT: [[TMP37:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; SSE41-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP37]]
-; SSE41-NEXT: [[TMP38:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; SSE41-NEXT: [[CONV:%.*]] = sext i16 [[TMP38]] to i32
-; SSE41-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP37]]
-; SSE41-NEXT: [[TMP39:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; SSE41-NEXT: [[CONV5:%.*]] = sext i16 [[TMP39]] to i32
+; SSE41-NEXT: [[TMP43:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; SSE41-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP43]]
+; SSE41-NEXT: [[TMP44:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; SSE41-NEXT: [[CONV:%.*]] = sext i16 [[TMP44]] to i32
+; SSE41-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP43]]
+; SSE41-NEXT: [[TMP45:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
+; SSE41-NEXT: [[CONV5:%.*]] = sext i16 [[TMP45]] to i32
; SSE41-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
-; SSE41-NEXT: [[TMP40:%.*]] = or i64 [[TMP37]], 1
-; SSE41-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP40]]
-; SSE41-NEXT: [[TMP41:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; SSE41-NEXT: [[CONV11:%.*]] = sext i16 [[TMP41]] to i32
-; SSE41-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP40]]
-; SSE41-NEXT: [[TMP42:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
-; SSE41-NEXT: [[CONV16:%.*]] = sext i16 [[TMP42]] to i32
+; SSE41-NEXT: [[TMP46:%.*]] = or i64 [[TMP43]], 1
+; SSE41-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP46]]
+; SSE41-NEXT: [[TMP47:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; SSE41-NEXT: [[CONV11:%.*]] = sext i16 [[TMP47]] to i32
+; SSE41-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP46]]
+; SSE41-NEXT: [[TMP48:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
+; SSE41-NEXT: [[CONV16:%.*]] = sext i16 [[TMP48]] to i32
; SSE41-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
; SSE41-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
; SSE41-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]]
; AVX1-NEXT: [[TMP41:%.*]] = mul nsw <4 x i32> [[TMP37]], [[TMP21]]
; AVX1-NEXT: [[TMP42:%.*]] = mul nsw <4 x i32> [[TMP38]], [[TMP22]]
; AVX1-NEXT: [[TMP43:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP23]]
-; AVX1-NEXT: [[TMP44:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
-; AVX1-NEXT: [[TMP45:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
-; AVX1-NEXT: [[TMP46:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
-; AVX1-NEXT: [[TMP47:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
-; AVX1-NEXT: [[TMP48:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32>
-; AVX1-NEXT: [[TMP49:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32>
-; AVX1-NEXT: [[TMP50:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32>
-; AVX1-NEXT: [[TMP51:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32>
-; AVX1-NEXT: [[TMP52:%.*]] = mul nsw <4 x i32> [[TMP48]], [[TMP44]]
-; AVX1-NEXT: [[TMP53:%.*]] = mul nsw <4 x i32> [[TMP49]], [[TMP45]]
-; AVX1-NEXT: [[TMP54:%.*]] = mul nsw <4 x i32> [[TMP50]], [[TMP46]]
-; AVX1-NEXT: [[TMP55:%.*]] = mul nsw <4 x i32> [[TMP51]], [[TMP47]]
-; AVX1-NEXT: [[TMP56:%.*]] = add nsw <4 x i32> [[TMP52]], [[TMP40]]
-; AVX1-NEXT: [[TMP57:%.*]] = add nsw <4 x i32> [[TMP53]], [[TMP41]]
-; AVX1-NEXT: [[TMP58:%.*]] = add nsw <4 x i32> [[TMP54]], [[TMP42]]
-; AVX1-NEXT: [[TMP59:%.*]] = add nsw <4 x i32> [[TMP55]], [[TMP43]]
-; AVX1-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
-; AVX1-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP1]]
-; AVX1-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP2]]
-; AVX1-NEXT: [[TMP63:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP3]]
-; AVX1-NEXT: [[TMP64:%.*]] = getelementptr inbounds i32, i32* [[TMP60]], i32 0
-; AVX1-NEXT: [[TMP65:%.*]] = bitcast i32* [[TMP64]] to <4 x i32>*
-; AVX1-NEXT: store <4 x i32> [[TMP56]], <4 x i32>* [[TMP65]], align 4
-; AVX1-NEXT: [[TMP66:%.*]] = getelementptr inbounds i32, i32* [[TMP60]], i32 4
-; AVX1-NEXT: [[TMP67:%.*]] = bitcast i32* [[TMP66]] to <4 x i32>*
-; AVX1-NEXT: store <4 x i32> [[TMP57]], <4 x i32>* [[TMP67]], align 4
-; AVX1-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[TMP60]], i32 8
-; AVX1-NEXT: [[TMP69:%.*]] = bitcast i32* [[TMP68]] to <4 x i32>*
-; AVX1-NEXT: store <4 x i32> [[TMP58]], <4 x i32>* [[TMP69]], align 4
-; AVX1-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TMP60]], i32 12
-; AVX1-NEXT: [[TMP71:%.*]] = bitcast i32* [[TMP70]] to <4 x i32>*
-; AVX1-NEXT: store <4 x i32> [[TMP59]], <4 x i32>* [[TMP71]], align 4
+; AVX1-NEXT: [[TMP52:%.*]] = sext <4 x i16> [[STRIDED_VEC7]] to <4 x i32>
+; AVX1-NEXT: [[TMP53:%.*]] = sext <4 x i16> [[STRIDED_VEC8]] to <4 x i32>
+; AVX1-NEXT: [[TMP54:%.*]] = sext <4 x i16> [[STRIDED_VEC9]] to <4 x i32>
+; AVX1-NEXT: [[TMP55:%.*]] = sext <4 x i16> [[STRIDED_VEC10]] to <4 x i32>
+; AVX1-NEXT: [[TMP60:%.*]] = sext <4 x i16> [[STRIDED_VEC19]] to <4 x i32>
+; AVX1-NEXT: [[TMP61:%.*]] = sext <4 x i16> [[STRIDED_VEC20]] to <4 x i32>
+; AVX1-NEXT: [[TMP62:%.*]] = sext <4 x i16> [[STRIDED_VEC21]] to <4 x i32>
+; AVX1-NEXT: [[TMP63:%.*]] = sext <4 x i16> [[STRIDED_VEC22]] to <4 x i32>
+; AVX1-NEXT: [[TMP64:%.*]] = mul nsw <4 x i32> [[TMP60]], [[TMP52]]
+; AVX1-NEXT: [[TMP65:%.*]] = mul nsw <4 x i32> [[TMP61]], [[TMP53]]
+; AVX1-NEXT: [[TMP66:%.*]] = mul nsw <4 x i32> [[TMP62]], [[TMP54]]
+; AVX1-NEXT: [[TMP67:%.*]] = mul nsw <4 x i32> [[TMP63]], [[TMP55]]
+; AVX1-NEXT: [[TMP68:%.*]] = add nsw <4 x i32> [[TMP64]], [[TMP40]]
+; AVX1-NEXT: [[TMP69:%.*]] = add nsw <4 x i32> [[TMP65]], [[TMP41]]
+; AVX1-NEXT: [[TMP70:%.*]] = add nsw <4 x i32> [[TMP66]], [[TMP42]]
+; AVX1-NEXT: [[TMP71:%.*]] = add nsw <4 x i32> [[TMP67]], [[TMP43]]
+; AVX1-NEXT: [[TMP72:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
+; AVX1-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP1]]
+; AVX1-NEXT: [[TMP74:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP2]]
+; AVX1-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[TMP3]]
+; AVX1-NEXT: [[TMP76:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 0
+; AVX1-NEXT: [[TMP77:%.*]] = bitcast i32* [[TMP76]] to <4 x i32>*
+; AVX1-NEXT: store <4 x i32> [[TMP68]], <4 x i32>* [[TMP77]], align 4
+; AVX1-NEXT: [[TMP78:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 4
+; AVX1-NEXT: [[TMP79:%.*]] = bitcast i32* [[TMP78]] to <4 x i32>*
+; AVX1-NEXT: store <4 x i32> [[TMP69]], <4 x i32>* [[TMP79]], align 4
+; AVX1-NEXT: [[TMP80:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 8
+; AVX1-NEXT: [[TMP81:%.*]] = bitcast i32* [[TMP80]] to <4 x i32>*
+; AVX1-NEXT: store <4 x i32> [[TMP70]], <4 x i32>* [[TMP81]], align 4
+; AVX1-NEXT: [[TMP82:%.*]] = getelementptr inbounds i32, i32* [[TMP72]], i32 12
+; AVX1-NEXT: [[TMP83:%.*]] = bitcast i32* [[TMP82]] to <4 x i32>*
+; AVX1-NEXT: store <4 x i32> [[TMP71]], <4 x i32>* [[TMP83]], align 4
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; AVX1-NEXT: [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX1-NEXT: br i1 [[TMP72]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX1-NEXT: [[TMP84:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX1-NEXT: br i1 [[TMP84]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX1: middle.block:
; AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX1-NEXT: br label [[FOR_BODY:%.*]]
; AVX1: for.body:
; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; AVX1-NEXT: [[TMP73:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP73]]
-; AVX1-NEXT: [[TMP74:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; AVX1-NEXT: [[CONV:%.*]] = sext i16 [[TMP74]] to i32
-; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP73]]
-; AVX1-NEXT: [[TMP75:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; AVX1-NEXT: [[CONV5:%.*]] = sext i16 [[TMP75]] to i32
+; AVX1-NEXT: [[TMP85:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP85]]
+; AVX1-NEXT: [[TMP86:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; AVX1-NEXT: [[CONV:%.*]] = sext i16 [[TMP86]] to i32
+; AVX1-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP85]]
+; AVX1-NEXT: [[TMP87:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
+; AVX1-NEXT: [[CONV5:%.*]] = sext i16 [[TMP87]] to i32
; AVX1-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
-; AVX1-NEXT: [[TMP76:%.*]] = or i64 [[TMP73]], 1
-; AVX1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP76]]
-; AVX1-NEXT: [[TMP77:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; AVX1-NEXT: [[CONV11:%.*]] = sext i16 [[TMP77]] to i32
-; AVX1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP76]]
-; AVX1-NEXT: [[TMP78:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
-; AVX1-NEXT: [[CONV16:%.*]] = sext i16 [[TMP78]] to i32
+; AVX1-NEXT: [[TMP88:%.*]] = or i64 [[TMP85]], 1
+; AVX1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP88]]
+; AVX1-NEXT: [[TMP89:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; AVX1-NEXT: [[CONV11:%.*]] = sext i16 [[TMP89]] to i32
+; AVX1-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP88]]
+; AVX1-NEXT: [[TMP90:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
+; AVX1-NEXT: [[CONV16:%.*]] = sext i16 [[TMP90]] to i32
; AVX1-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
; AVX1-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
; AVX1-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]]
; AVX2-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i16> [[WIDE_VEC2]], <16 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; AVX2-NEXT: [[TMP9:%.*]] = sext <8 x i16> [[STRIDED_VEC3]] to <8 x i32>
; AVX2-NEXT: [[TMP10:%.*]] = mul nsw <8 x i32> [[TMP9]], [[TMP5]]
-; AVX2-NEXT: [[TMP11:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i32>
-; AVX2-NEXT: [[TMP12:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i32>
-; AVX2-NEXT: [[TMP13:%.*]] = mul nsw <8 x i32> [[TMP12]], [[TMP11]]
-; AVX2-NEXT: [[TMP14:%.*]] = add nsw <8 x i32> [[TMP13]], [[TMP10]]
-; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
-; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
-; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>*
-; AVX2-NEXT: store <8 x i32> [[TMP14]], <8 x i32>* [[TMP17]], align 4
+; AVX2-NEXT: [[TMP13:%.*]] = sext <8 x i16> [[STRIDED_VEC1]] to <8 x i32>
+; AVX2-NEXT: [[TMP15:%.*]] = sext <8 x i16> [[STRIDED_VEC4]] to <8 x i32>
+; AVX2-NEXT: [[TMP16:%.*]] = mul nsw <8 x i32> [[TMP15]], [[TMP13]]
+; AVX2-NEXT: [[TMP17:%.*]] = add nsw <8 x i32> [[TMP16]], [[TMP10]]
+; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[D1:%.*]], i64 [[TMP0]]
+; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
+; AVX2-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <8 x i32>*
+; AVX2-NEXT: store <8 x i32> [[TMP17]], <8 x i32>* [[TMP20]], align 4
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; AVX2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; AVX2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; AVX2-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX2-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; AVX2: middle.block:
; AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; AVX2-NEXT: br label [[FOR_BODY:%.*]]
; AVX2: for.body:
; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; AVX2-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
-; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP19]]
-; AVX2-NEXT: [[TMP20:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; AVX2-NEXT: [[CONV:%.*]] = sext i16 [[TMP20]] to i32
-; AVX2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP19]]
-; AVX2-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
-; AVX2-NEXT: [[CONV5:%.*]] = sext i16 [[TMP21]] to i32
+; AVX2-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP22]]
+; AVX2-NEXT: [[TMP23:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
+; AVX2-NEXT: [[CONV:%.*]] = sext i16 [[TMP23]] to i32
+; AVX2-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP22]]
+; AVX2-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX4]], align 2
+; AVX2-NEXT: [[CONV5:%.*]] = sext i16 [[TMP24]] to i32
; AVX2-NEXT: [[MUL6:%.*]] = mul nsw i32 [[CONV5]], [[CONV]]
-; AVX2-NEXT: [[TMP22:%.*]] = or i64 [[TMP19]], 1
-; AVX2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP22]]
-; AVX2-NEXT: [[TMP23:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
-; AVX2-NEXT: [[CONV11:%.*]] = sext i16 [[TMP23]] to i32
-; AVX2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP22]]
-; AVX2-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
-; AVX2-NEXT: [[CONV16:%.*]] = sext i16 [[TMP24]] to i32
+; AVX2-NEXT: [[TMP25:%.*]] = or i64 [[TMP22]], 1
+; AVX2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[S1]], i64 [[TMP25]]
+; AVX2-NEXT: [[TMP26:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2
+; AVX2-NEXT: [[CONV11:%.*]] = sext i16 [[TMP26]] to i32
+; AVX2-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, i16* [[S2]], i64 [[TMP25]]
+; AVX2-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX15]], align 2
+; AVX2-NEXT: [[CONV16:%.*]] = sext i16 [[TMP27]] to i32
; AVX2-NEXT: [[MUL17:%.*]] = mul nsw i32 [[CONV16]], [[CONV11]]
; AVX2-NEXT: [[ADD18:%.*]] = add nsw i32 [[MUL17]], [[MUL6]]
; AVX2-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[D1]], i64 [[INDVARS_IV]]
define void @foo(i64* %p, i64* %p.last) unnamed_addr #0 {
; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[P4:%.*]] = ptrtoint i64* [[P:%.*]] to i64
-; CHECK-NEXT: [[P_LAST1:%.*]] = ptrtoint i64* [[P_LAST:%.*]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[P_LAST1]], -1024
-; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[P4]]
-; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 10
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 128
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP4]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i64* [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 0, i64 128, i64 256, i64 384>
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 512, i64 640, i64 768, i64 896>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 1024, i64 1152, i64 1280, i64 1408>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 1536, i64 1664, i64 1792, i64 1920>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64*> [[TMP5]] to <4 x %0**>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i64*> [[TMP6]] to <4 x %0**>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i64*> [[TMP7]] to <4 x %0**>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i64*> [[TMP8]] to <4 x %0**>
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP9]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP10]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP11]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP12]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[PTR_IND]] = getelementptr i64, i64* [[POINTER_PHI]], i64 2048
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[P2:%.*]] = phi i64* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P_INC:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[P_INC]] = getelementptr inbounds i64, i64* [[P2]], i64 128
-; CHECK-NEXT: [[P3:%.*]] = bitcast i64* [[P2]] to %0**
-; CHECK-NEXT: [[V:%.*]] = load %0*, %0** [[P3]], align 8
-; CHECK-NEXT: [[B:%.*]] = icmp eq i64* [[P_INC]], [[P_LAST]]
-; CHECK-NEXT: br i1 [[B]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK: vector.body:
+; CHECK: [[WIDE_MASKED_GATHER:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP11:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP12:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP13:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %0*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.0(<4 x %0**> [[TMP14:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %0*> undef)
;
entry:
br label %loop
define void @bar(i64* %p, i64* %p.last) unnamed_addr #0 {
; CHECK-LABEL: @bar(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[P4:%.*]] = ptrtoint i64* [[P:%.*]] to i64
-; CHECK-NEXT: [[P_LAST1:%.*]] = ptrtoint i64* [[P_LAST:%.*]] to i64
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[P_LAST1]], -1024
-; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[P4]]
-; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP1]], 10
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP3]], 16
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP3]], 16
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP3]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[N_VEC]], 128
-; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i64, i64* [[P]], i64 [[TMP4]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i64* [ [[P]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 0, i64 128, i64 256, i64 384>
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 512, i64 640, i64 768, i64 896>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 1024, i64 1152, i64 1280, i64 1408>
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <4 x i64> <i64 1536, i64 1664, i64 1792, i64 1920>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast <4 x i64*> [[TMP5]] to <4 x %1**>
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i64*> [[TMP6]] to <4 x %1**>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i64*> [[TMP7]] to <4 x %1**>
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i64*> [[TMP8]] to <4 x %1**>
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP9]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP10]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP11]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP12]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[PTR_IND]] = getelementptr i64, i64* [[POINTER_PHI]], i64 2048
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[P]], [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[P2:%.*]] = phi i64* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[P_INC:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[P_INC]] = getelementptr inbounds i64, i64* [[P2]], i64 128
-; CHECK-NEXT: [[P3:%.*]] = bitcast i64* [[P2]] to %1**
-; CHECK-NEXT: [[V:%.*]] = load %1*, %1** [[P3]], align 8
-; CHECK-NEXT: [[B:%.*]] = icmp eq i64* [[P_INC]], [[P_LAST]]
-; CHECK-NEXT: br i1 [[B]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK: vector.body:
+; CHECK: [[WIDE_MASKED_GATHER:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP11:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP12:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP13:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x %1*> @llvm.masked.gather.v4p0s_s.v4p0p0s_s.1(<4 x %1**> [[TMP14:%.*]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x %1*> undef)
;
entry:
br label %loop
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -mcpu=prescott -disable-basic-aa < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
; PR15344
define void @test1(float* nocapture %arg, i32 %arg1) nounwind {
; CHECK-LABEL: @test1(
-; CHECK-NEXT: bb:
-; CHECK-NEXT: [[ARG2:%.*]] = bitcast float* [[ARG:%.*]] to i8*
-; CHECK-NEXT: br label [[BB2:%.*]]
-; CHECK: bb2:
-; CHECK-NEXT: [[TMP:%.*]] = load double, double* null, align 8
-; CHECK-NEXT: br i1 undef, label [[BB3_PREHEADER:%.*]], label [[BB12:%.*]]
-; CHECK: bb3.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[ARG1:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[ARG]], i32 [[ARG1]]
-; CHECK-NEXT: [[SCEVGEP3:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[ARG1]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[ARG1]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> zeroinitializer, double [[TMP]], i32 0
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x double> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <2 x double> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [16 x double], [16 x double]* undef, i32 0, i32 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [16 x double], [16 x double]* undef, i32 0, i32 [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP5]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 2
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP9]] = fadd fast <2 x double> [[VEC_PHI]], undef
-; CHECK-NEXT: [[TMP10]] = fadd fast <2 x double> [[VEC_PHI4]], undef
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[ARG]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[ARG]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> undef, <2 x float>* [[TMP14]], align 4, !alias.scope !3, !noalias !0
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 2
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP15]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> undef, <2 x float>* [[TMP16]], align 4, !alias.scope !3, !noalias !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP9]]
-; CHECK-NEXT: [[TMP18:%.*]] = call fast double @llvm.vector.reduce.fadd.v2f64(double -0.000000e+00, <2 x double> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[ARG1]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[BB12_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[BB3_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP]], [[VECTOR_MEMCHECK]] ], [ [[TMP]], [[BB3_PREHEADER]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[BB3:%.*]]
-; CHECK: bb3:
-; CHECK-NEXT: [[TMP4:%.*]] = phi double [ [[TMP9:%.*]], [[BB3]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP8:%.*]], [[BB3]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [16 x double], [16 x double]* undef, i32 0, i32 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[TMP6]], align 4
-; CHECK-NEXT: [[TMP8]] = add nsw i32 [[TMP5]], 1
-; CHECK-NEXT: [[TMP9]] = fadd fast double [[TMP4]], undef
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[ARG]], i32 [[TMP5]]
-; CHECK-NEXT: store float undef, float* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP8]], [[ARG1]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[BB12_LOOPEXIT]], label [[BB3]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: bb12.loopexit:
-; CHECK-NEXT: [[TMP9_LCSSA:%.*]] = phi double [ [[TMP9]], [[BB3]] ], [ [[TMP18]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[BB12]]
-; CHECK: bb12:
-; CHECK-NEXT: [[TMP13:%.*]] = phi double [ [[TMP]], [[BB2]] ], [ [[TMP9_LCSSA]], [[BB12_LOOPEXIT]] ]
-; CHECK-NEXT: ret void
-;
+; CHECK: preheader
+; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0
+; CHECK: vector.memcheck
bb:
br label %bb2
; CHECK-NEXT: [[TMP9]] = fadd fast <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP9]], [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
; CHECK-NEXT: [[SUM_INC]] = fadd fast float [[SUM]], [[VALUE]]
; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1
; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
-; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP2:!llvm.loop !.*]]
; CHECK: loop.exit.loopexit:
; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP_EXIT]]
; CHECK-NEXT: [[TMP9]] = fadd reassoc <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP9]], [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
; CHECK-NEXT: [[SUM_INC]] = fadd reassoc float [[SUM]], [[VALUE]]
; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1
; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
-; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP5:!llvm.loop !.*]]
; CHECK: loop.exit.loopexit:
; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP_EXIT]]
; CHECK-NEXT: [[TMP9]] = fadd reassoc contract <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd reassoc contract <4 x float> [[TMP9]], [[TMP8]]
; CHECK-NEXT: [[TMP11:%.*]] = call reassoc contract float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]])
; CHECK-NEXT: [[SUM_INC]] = fadd reassoc contract float [[SUM]], [[VALUE]]
; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1
; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096
-; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], [[LOOP7:!llvm.loop !.*]]
; CHECK: loop.exit.loopexit:
; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP_EXIT]]
; CHECK-NEXT: [[TMP11]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf nsz ogt <4 x float> [[TMP10]], [[TMP11]]
; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf nsz <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP10]], <4 x float> [[TMP11]]
; CHECK-NEXT: [[MAX_0_]] = select i1 [[CMP1_INV]], float [[TMP14]], float [[MAX_013]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]]
;
entry:
%cmp12 = icmp sgt i32 %N, 0
; CHECK-NEXT: [[TMP11]] = select <4 x i1> [[TMP9]], <4 x float> [[WIDE_LOAD2]], <4 x float> [[VEC_PHI1]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf ogt <4 x float> [[TMP10]], [[TMP11]]
; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP10]], <4 x float> [[TMP11]]
; CHECK-NEXT: [[MAX_0_]] = select nnan ninf i1 [[CMP1_INV]], float [[TMP14]], float [[MAX_013]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]]
;
entry:
%cmp12 = icmp sgt i32 %N, 0
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt < %s -loop-vectorize -mcpu=core-avx2 -force-vector-interleave=1 -dce -instcombine -debug-only=loop-vectorize -S 2>&1 | FileCheck %s
; CHECK: LV: Found an estimated cost of {{[0-9]+}} for VF 2 For instruction: br
;
define i8 @reduction_i8(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) {
-; CHECK-LABEL: @reduction_i8(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP_12:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP_12]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 3
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP6]], align 1
-; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i8> [[VEC_PHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP8]] = add <4 x i8> [[TMP7]], [[WIDE_LOAD1]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP10:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP8]])
-; CHECK-NEXT: [[TMP11:%.*]] = zext i8 [[TMP10]] to i32
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.for.cond.cleanup_crit_edge:
-; CHECK-NEXT: [[ADD5_LCSSA:%.*]] = phi i32 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[CONV6:%.*]] = trunc i32 [[ADD5_LCSSA]] to i8
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i8 [ [[CONV6]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: ret i8 [[SUM_0_LCSSA]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_013:%.*]] = phi i32 [ [[ADD5]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP12]] to i32
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP13]] to i32
-; CHECK-NEXT: [[CONV4:%.*]] = and i32 [[SUM_013]], 255
-; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[CONV4]], [[CONV]]
-; CHECK-NEXT: [[ADD5]] = add nuw nsw i32 [[ADD]], [[CONV3]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-;
entry:
%cmp.12 = icmp sgt i32 %n, 0
br i1 %cmp.12, label %for.body.preheader, label %for.cond.cleanup
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
; RUN: opt < %s -debug-only=loop-vectorize -loop-vectorize -vectorizer-maximize-bandwidth -mtriple=x86_64-unknown-linux -mattr=+avx512f -S 2>&1 | FileCheck %s --check-prefix=AVX512F
; REQUIRES: asserts
; This function has a loop with a SAD (sum of absolute differences) pattern.
; Here we check that when VF = 16 the register usage doesn't exceed 16.
;
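; A rough C-level sketch of the loop being checked (the @a/@b arrays and the
; 1024 trip count are taken from the CHECK lines below; everything else is
; illustrative, not the exact source of this test):
;   int s = 0;
;   for (int i = 0; i < 1024; i++) {
;     int d = a[i] - b[i];           /* a, b are unsigned char arrays */
;     s += (d > -1) ? d : -d;        /* abs(a[i] - b[i]) -> SAD accumulation */
;   }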
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: iter.check:
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1
-; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP7]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = sub nsw <16 x i32> [[TMP4]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp sgt <16 x i32> [[TMP9]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP10]], <16 x i32> [[TMP9]], <16 x i32> [[TMP11]]
-; CHECK-NEXT: [[TMP13]] = add <16 x i32> [[TMP12]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP13]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP15]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <8 x i32> [ [[TMP16]], [[VEC_EPILOG_PH]] ], [ [[TMP30:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TMP18]], i32 0
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to <8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i8>, <8 x i8>* [[TMP20]], align 1
-; CHECK-NEXT: [[TMP21:%.*]] = zext <8 x i8> [[WIDE_LOAD6]] to <8 x i32>
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP17]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP22]], i32 0
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i8* [[TMP23]] to <8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i8>, <8 x i8>* [[TMP24]], align 1
-; CHECK-NEXT: [[TMP25:%.*]] = zext <8 x i8> [[WIDE_LOAD7]] to <8 x i32>
-; CHECK-NEXT: [[TMP26:%.*]] = sub nsw <8 x i32> [[TMP21]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = icmp sgt <8 x i32> [[TMP26]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT: [[TMP28:%.*]] = sub nsw <8 x i32> zeroinitializer, [[TMP26]]
-; CHECK-NEXT: [[TMP29:%.*]] = select <8 x i1> [[TMP27]], <8 x i32> [[TMP26]], <8 x i32> [[TMP28]]
-; CHECK-NEXT: [[TMP30]] = add <8 x i32> [[TMP29]], [[VEC_PHI5]]
-; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT8]], 1024
-; CHECK-NEXT: br i1 [[TMP31]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP30]])
-; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N3]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP15]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP32]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA2:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP32]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[TMP15]], [[MIDDLE_BLOCK]] ], [ [[ADD_LCSSA2]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT: ret i32 [[ADD_LCSSA]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[S_015:%.*]] = phi i32 [ [[BC_MERGE_RDX9]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP33:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP33]] to i32
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP34:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP34]] to i32
-; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV3]]
-; CHECK-NEXT: [[ISPOS:%.*]] = icmp sgt i32 [[SUB]], -1
-; CHECK-NEXT: [[NEG:%.*]] = sub nsw i32 0, [[SUB]]
-; CHECK-NEXT: [[TMP35:%.*]] = select i1 [[ISPOS]], i32 [[SUB]], i32 [[NEG]]
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP35]], [[S_015]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-;
-; AVX512F-LABEL: @foo(
-; AVX512F-NEXT: iter.check:
-; AVX512F-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; AVX512F: vector.main.loop.iter.check:
-; AVX512F-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; AVX512F: vector.ph:
-; AVX512F-NEXT: br label [[VECTOR_BODY:%.*]]
-; AVX512F: vector.body:
-; AVX512F-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT: [[VEC_PHI:%.*]] = phi <64 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT: [[VEC_PHI1:%.*]] = phi <64 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; AVX512F-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 64
-; AVX512F-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP0]]
-; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP1]]
-; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; AVX512F-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <64 x i8>*
-; AVX512F-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP5]], align 1
-; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 64
-; AVX512F-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <64 x i8>*
-; AVX512F-NEXT: [[WIDE_LOAD2:%.*]] = load <64 x i8>, <64 x i8>* [[TMP7]], align 1
-; AVX512F-NEXT: [[TMP8:%.*]] = zext <64 x i8> [[WIDE_LOAD]] to <64 x i32>
-; AVX512F-NEXT: [[TMP9:%.*]] = zext <64 x i8> [[WIDE_LOAD2]] to <64 x i32>
-; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP0]]
-; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP1]]
-; AVX512F-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 0
-; AVX512F-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <64 x i8>*
-; AVX512F-NEXT: [[WIDE_LOAD3:%.*]] = load <64 x i8>, <64 x i8>* [[TMP13]], align 1
-; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 64
-; AVX512F-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <64 x i8>*
-; AVX512F-NEXT: [[WIDE_LOAD4:%.*]] = load <64 x i8>, <64 x i8>* [[TMP15]], align 1
-; AVX512F-NEXT: [[TMP16:%.*]] = zext <64 x i8> [[WIDE_LOAD3]] to <64 x i32>
-; AVX512F-NEXT: [[TMP17:%.*]] = zext <64 x i8> [[WIDE_LOAD4]] to <64 x i32>
-; AVX512F-NEXT: [[TMP18:%.*]] = sub nsw <64 x i32> [[TMP8]], [[TMP16]]
-; AVX512F-NEXT: [[TMP19:%.*]] = sub nsw <64 x i32> [[TMP9]], [[TMP17]]
-; AVX512F-NEXT: [[TMP20:%.*]] = icmp sgt <64 x i32> [[TMP18]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; AVX512F-NEXT: [[TMP21:%.*]] = icmp sgt <64 x i32> [[TMP19]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; AVX512F-NEXT: [[TMP22:%.*]] = sub nsw <64 x i32> zeroinitializer, [[TMP18]]
-; AVX512F-NEXT: [[TMP23:%.*]] = sub nsw <64 x i32> zeroinitializer, [[TMP19]]
-; AVX512F-NEXT: [[TMP24:%.*]] = select <64 x i1> [[TMP20]], <64 x i32> [[TMP18]], <64 x i32> [[TMP22]]
-; AVX512F-NEXT: [[TMP25:%.*]] = select <64 x i1> [[TMP21]], <64 x i32> [[TMP19]], <64 x i32> [[TMP23]]
-; AVX512F-NEXT: [[TMP26]] = add <64 x i32> [[TMP24]], [[VEC_PHI]]
-; AVX512F-NEXT: [[TMP27]] = add <64 x i32> [[TMP25]], [[VEC_PHI1]]
-; AVX512F-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
-; AVX512F-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; AVX512F-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; AVX512F: middle.block:
-; AVX512F-NEXT: [[BIN_RDX:%.*]] = add <64 x i32> [[TMP27]], [[TMP26]]
-; AVX512F-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[BIN_RDX]])
-; AVX512F-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; AVX512F-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; AVX512F: vec.epilog.iter.check:
-; AVX512F-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; AVX512F: vec.epilog.ph:
-; AVX512F-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP29]], [[VEC_EPILOG_ITER_CHECK]] ]
-; AVX512F-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; AVX512F-NEXT: [[TMP30:%.*]] = insertelement <32 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; AVX512F-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; AVX512F: vec.epilog.vector.body:
-; AVX512F-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512F-NEXT: [[VEC_PHI8:%.*]] = phi <32 x i32> [ [[TMP30]], [[VEC_EPILOG_PH]] ], [ [[TMP44:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512F-NEXT: [[TMP31:%.*]] = add i64 [[OFFSET_IDX]], 0
-; AVX512F-NEXT: [[TMP32:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP31]]
-; AVX512F-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, i8* [[TMP32]], i32 0
-; AVX512F-NEXT: [[TMP34:%.*]] = bitcast i8* [[TMP33]] to <32 x i8>*
-; AVX512F-NEXT: [[WIDE_LOAD9:%.*]] = load <32 x i8>, <32 x i8>* [[TMP34]], align 1
-; AVX512F-NEXT: [[TMP35:%.*]] = zext <32 x i8> [[WIDE_LOAD9]] to <32 x i32>
-; AVX512F-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP31]]
-; AVX512F-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, i8* [[TMP36]], i32 0
-; AVX512F-NEXT: [[TMP38:%.*]] = bitcast i8* [[TMP37]] to <32 x i8>*
-; AVX512F-NEXT: [[WIDE_LOAD10:%.*]] = load <32 x i8>, <32 x i8>* [[TMP38]], align 1
-; AVX512F-NEXT: [[TMP39:%.*]] = zext <32 x i8> [[WIDE_LOAD10]] to <32 x i32>
-; AVX512F-NEXT: [[TMP40:%.*]] = sub nsw <32 x i32> [[TMP35]], [[TMP39]]
-; AVX512F-NEXT: [[TMP41:%.*]] = icmp sgt <32 x i32> [[TMP40]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; AVX512F-NEXT: [[TMP42:%.*]] = sub nsw <32 x i32> zeroinitializer, [[TMP40]]
-; AVX512F-NEXT: [[TMP43:%.*]] = select <32 x i1> [[TMP41]], <32 x i32> [[TMP40]], <32 x i32> [[TMP42]]
-; AVX512F-NEXT: [[TMP44]] = add <32 x i32> [[TMP43]], [[VEC_PHI8]]
-; AVX512F-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 32
-; AVX512F-NEXT: [[TMP45:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 1024
-; AVX512F-NEXT: br i1 [[TMP45]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; AVX512F: vec.epilog.middle.block:
-; AVX512F-NEXT: [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP44]])
-; AVX512F-NEXT: [[CMP_N6:%.*]] = icmp eq i64 1024, 1024
-; AVX512F-NEXT: br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; AVX512F: vec.epilog.scalar.ph:
-; AVX512F-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; AVX512F-NEXT: [[BC_MERGE_RDX12:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP29]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP46]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AVX512F-NEXT: br label [[FOR_BODY:%.*]]
-; AVX512F: for.cond.cleanup.loopexit:
-; AVX512F-NEXT: [[ADD_LCSSA5:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP46]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AVX512F-NEXT: br label [[FOR_COND_CLEANUP]]
-; AVX512F: for.cond.cleanup:
-; AVX512F-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[TMP29]], [[MIDDLE_BLOCK]] ], [ [[ADD_LCSSA5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; AVX512F-NEXT: ret i32 [[ADD_LCSSA]]
-; AVX512F: for.body:
-; AVX512F-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; AVX512F-NEXT: [[S_015:%.*]] = phi i32 [ [[BC_MERGE_RDX12]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; AVX512F-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[INDVARS_IV]]
-; AVX512F-NEXT: [[TMP47:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX512F-NEXT: [[CONV:%.*]] = zext i8 [[TMP47]] to i32
-; AVX512F-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[INDVARS_IV]]
-; AVX512F-NEXT: [[TMP48:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; AVX512F-NEXT: [[CONV3:%.*]] = zext i8 [[TMP48]] to i32
-; AVX512F-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV3]]
-; AVX512F-NEXT: [[ISPOS:%.*]] = icmp sgt i32 [[SUB]], -1
-; AVX512F-NEXT: [[NEG:%.*]] = sub nsw i32 0, [[SUB]]
-; AVX512F-NEXT: [[TMP49:%.*]] = select i1 [[ISPOS]], i32 [[SUB]], i32 [[NEG]]
-; AVX512F-NEXT: [[ADD]] = add nsw i32 [[TMP49]], [[S_015]]
-; AVX512F-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; AVX512F-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; AVX512F-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-;
+; CHECK-LABEL: foo
+; CHECK: LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 7 registers
+; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
+; CHECK: LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 13 registers
+; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
entry:
br label %for.body
; For an indvars.iv that is used in a computing chain feeding only into getelementptr
; or cmp, no vector version of it is created, so the vector register usage will not
; exceed the number of available vector registers.
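; A minimal C-level sketch of that situation (the +3/+2 offsets and the 1024
; trip count are taken from the CHECK lines; the rest is illustrative):
;   int s = 0;
;   for (int i = 0; i < 1024; i++)
;     s += abs(a[i + 3] - b[i + 2]); /* i itself only feeds addresses and the bound check */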
-; CHECK-LABEL: @goo(
-; CHECK-NEXT: iter.check:
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[TMP0]], 3
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[TMP0]], 2
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <16 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP9]], align 1
-; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> [[TMP5]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = icmp sgt <16 x i32> [[TMP11]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT: [[TMP13:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP11]]
-; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP11]], <16 x i32> [[TMP13]]
-; CHECK-NEXT: [[TMP15]] = add <16 x i32> [[TMP14]], [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP15]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <8 x i32> [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP34:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP20:%.*]] = add nsw i64 [[TMP19]], 3
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, i8* [[TMP21]], i32 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast i8* [[TMP22]] to <8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i8>, <8 x i8>* [[TMP23]], align 1
-; CHECK-NEXT: [[TMP24:%.*]] = zext <8 x i8> [[WIDE_LOAD6]] to <8 x i32>
-; CHECK-NEXT: [[TMP25:%.*]] = add nsw i64 [[TMP19]], 2
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, i8* [[TMP26]], i32 0
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast i8* [[TMP27]] to <8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i8>, <8 x i8>* [[TMP28]], align 1
-; CHECK-NEXT: [[TMP29:%.*]] = zext <8 x i8> [[WIDE_LOAD7]] to <8 x i32>
-; CHECK-NEXT: [[TMP30:%.*]] = sub nsw <8 x i32> [[TMP24]], [[TMP29]]
-; CHECK-NEXT: [[TMP31:%.*]] = icmp sgt <8 x i32> [[TMP30]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT: [[TMP32:%.*]] = sub nsw <8 x i32> zeroinitializer, [[TMP30]]
-; CHECK-NEXT: [[TMP33:%.*]] = select <8 x i1> [[TMP31]], <8 x i32> [[TMP30]], <8 x i32> [[TMP32]]
-; CHECK-NEXT: [[TMP34]] = add <8 x i32> [[TMP33]], [[VEC_PHI5]]
-; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT8]], 1024
-; CHECK-NEXT: br i1 [[TMP35]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP34]])
-; CHECK-NEXT: [[CMP_N3:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N3]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP17]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP36]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA2:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP36]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[TMP17]], [[MIDDLE_BLOCK]] ], [ [[ADD_LCSSA2]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; CHECK-NEXT: ret i32 [[ADD_LCSSA]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[S_015:%.*]] = phi i32 [ [[BC_MERGE_RDX9]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], 3
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP1]]
-; CHECK-NEXT: [[TMP:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP]] to i32
-; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP3]] to i32
-; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV3]]
-; CHECK-NEXT: [[ISPOS:%.*]] = icmp sgt i32 [[SUB]], -1
-; CHECK-NEXT: [[NEG:%.*]] = sub nsw i32 0, [[SUB]]
-; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[ISPOS]], i32 [[SUB]], i32 [[NEG]]
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP4]], [[S_015]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
-; AVX512F-LABEL: @goo(
-; AVX512F-NEXT: iter.check:
-; AVX512F-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; AVX512F: vector.main.loop.iter.check:
-; AVX512F-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; AVX512F: vector.ph:
-; AVX512F-NEXT: br label [[VECTOR_BODY:%.*]]
-; AVX512F: vector.body:
-; AVX512F-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT: [[VEC_PHI:%.*]] = phi <64 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT: [[VEC_PHI1:%.*]] = phi <64 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; AVX512F-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 64
-; AVX512F-NEXT: [[TMP2:%.*]] = add nsw i64 [[TMP0]], 3
-; AVX512F-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP1]], 3
-; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP2]]
-; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP3]]
-; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; AVX512F-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <64 x i8>*
-; AVX512F-NEXT: [[WIDE_LOAD:%.*]] = load <64 x i8>, <64 x i8>* [[TMP7]], align 1
-; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 64
-; AVX512F-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <64 x i8>*
-; AVX512F-NEXT: [[WIDE_LOAD2:%.*]] = load <64 x i8>, <64 x i8>* [[TMP9]], align 1
-; AVX512F-NEXT: [[TMP10:%.*]] = zext <64 x i8> [[WIDE_LOAD]] to <64 x i32>
-; AVX512F-NEXT: [[TMP11:%.*]] = zext <64 x i8> [[WIDE_LOAD2]] to <64 x i32>
-; AVX512F-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP0]], 2
-; AVX512F-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP1]], 2
-; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP12]]
-; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP13]]
-; AVX512F-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i32 0
-; AVX512F-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <64 x i8>*
-; AVX512F-NEXT: [[WIDE_LOAD3:%.*]] = load <64 x i8>, <64 x i8>* [[TMP17]], align 1
-; AVX512F-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i32 64
-; AVX512F-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <64 x i8>*
-; AVX512F-NEXT: [[WIDE_LOAD4:%.*]] = load <64 x i8>, <64 x i8>* [[TMP19]], align 1
-; AVX512F-NEXT: [[TMP20:%.*]] = zext <64 x i8> [[WIDE_LOAD3]] to <64 x i32>
-; AVX512F-NEXT: [[TMP21:%.*]] = zext <64 x i8> [[WIDE_LOAD4]] to <64 x i32>
-; AVX512F-NEXT: [[TMP22:%.*]] = sub nsw <64 x i32> [[TMP10]], [[TMP20]]
-; AVX512F-NEXT: [[TMP23:%.*]] = sub nsw <64 x i32> [[TMP11]], [[TMP21]]
-; AVX512F-NEXT: [[TMP24:%.*]] = icmp sgt <64 x i32> [[TMP22]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; AVX512F-NEXT: [[TMP25:%.*]] = icmp sgt <64 x i32> [[TMP23]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; AVX512F-NEXT: [[TMP26:%.*]] = sub nsw <64 x i32> zeroinitializer, [[TMP22]]
-; AVX512F-NEXT: [[TMP27:%.*]] = sub nsw <64 x i32> zeroinitializer, [[TMP23]]
-; AVX512F-NEXT: [[TMP28:%.*]] = select <64 x i1> [[TMP24]], <64 x i32> [[TMP22]], <64 x i32> [[TMP26]]
-; AVX512F-NEXT: [[TMP29:%.*]] = select <64 x i1> [[TMP25]], <64 x i32> [[TMP23]], <64 x i32> [[TMP27]]
-; AVX512F-NEXT: [[TMP30]] = add <64 x i32> [[TMP28]], [[VEC_PHI]]
-; AVX512F-NEXT: [[TMP31]] = add <64 x i32> [[TMP29]], [[VEC_PHI1]]
-; AVX512F-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128
-; AVX512F-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; AVX512F-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; AVX512F: middle.block:
-; AVX512F-NEXT: [[BIN_RDX:%.*]] = add <64 x i32> [[TMP31]], [[TMP30]]
-; AVX512F-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> [[BIN_RDX]])
-; AVX512F-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; AVX512F-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; AVX512F: vec.epilog.iter.check:
-; AVX512F-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; AVX512F: vec.epilog.ph:
-; AVX512F-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP33]], [[VEC_EPILOG_ITER_CHECK]] ]
-; AVX512F-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; AVX512F-NEXT: [[TMP34:%.*]] = insertelement <32 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
-; AVX512F-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; AVX512F: vec.epilog.vector.body:
-; AVX512F-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512F-NEXT: [[VEC_PHI8:%.*]] = phi <32 x i32> [ [[TMP34]], [[VEC_EPILOG_PH]] ], [ [[TMP50:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512F-NEXT: [[TMP35:%.*]] = add i64 [[OFFSET_IDX]], 0
-; AVX512F-NEXT: [[TMP36:%.*]] = add nsw i64 [[TMP35]], 3
-; AVX512F-NEXT: [[TMP37:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP36]]
-; AVX512F-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, i8* [[TMP37]], i32 0
-; AVX512F-NEXT: [[TMP39:%.*]] = bitcast i8* [[TMP38]] to <32 x i8>*
-; AVX512F-NEXT: [[WIDE_LOAD9:%.*]] = load <32 x i8>, <32 x i8>* [[TMP39]], align 1
-; AVX512F-NEXT: [[TMP40:%.*]] = zext <32 x i8> [[WIDE_LOAD9]] to <32 x i32>
-; AVX512F-NEXT: [[TMP41:%.*]] = add nsw i64 [[TMP35]], 2
-; AVX512F-NEXT: [[TMP42:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP41]]
-; AVX512F-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, i8* [[TMP42]], i32 0
-; AVX512F-NEXT: [[TMP44:%.*]] = bitcast i8* [[TMP43]] to <32 x i8>*
-; AVX512F-NEXT: [[WIDE_LOAD10:%.*]] = load <32 x i8>, <32 x i8>* [[TMP44]], align 1
-; AVX512F-NEXT: [[TMP45:%.*]] = zext <32 x i8> [[WIDE_LOAD10]] to <32 x i32>
-; AVX512F-NEXT: [[TMP46:%.*]] = sub nsw <32 x i32> [[TMP40]], [[TMP45]]
-; AVX512F-NEXT: [[TMP47:%.*]] = icmp sgt <32 x i32> [[TMP46]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
-; AVX512F-NEXT: [[TMP48:%.*]] = sub nsw <32 x i32> zeroinitializer, [[TMP46]]
-; AVX512F-NEXT: [[TMP49:%.*]] = select <32 x i1> [[TMP47]], <32 x i32> [[TMP46]], <32 x i32> [[TMP48]]
-; AVX512F-NEXT: [[TMP50]] = add <32 x i32> [[TMP49]], [[VEC_PHI8]]
-; AVX512F-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 32
-; AVX512F-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 1024
-; AVX512F-NEXT: br i1 [[TMP51]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; AVX512F: vec.epilog.middle.block:
-; AVX512F-NEXT: [[TMP52:%.*]] = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> [[TMP50]])
-; AVX512F-NEXT: [[CMP_N6:%.*]] = icmp eq i64 1024, 1024
-; AVX512F-NEXT: br i1 [[CMP_N6]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; AVX512F: vec.epilog.scalar.ph:
-; AVX512F-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; AVX512F-NEXT: [[BC_MERGE_RDX12:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP33]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP52]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AVX512F-NEXT: br label [[FOR_BODY:%.*]]
-; AVX512F: for.cond.cleanup.loopexit:
-; AVX512F-NEXT: [[ADD_LCSSA5:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP52]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; AVX512F-NEXT: br label [[FOR_COND_CLEANUP]]
-; AVX512F: for.cond.cleanup:
-; AVX512F-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ [[ADD_LCSSA5]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
-; AVX512F-NEXT: ret i32 [[ADD_LCSSA]]
-; AVX512F: for.body:
-; AVX512F-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; AVX512F-NEXT: [[S_015:%.*]] = phi i32 [ [[BC_MERGE_RDX12]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
-; AVX512F-NEXT: [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], 3
-; AVX512F-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 [[TMP1]]
-; AVX512F-NEXT: [[TMP:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; AVX512F-NEXT: [[CONV:%.*]] = zext i8 [[TMP]] to i32
-; AVX512F-NEXT: [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], 2
-; AVX512F-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i8], [1024 x i8]* @b, i64 0, i64 [[TMP2]]
-; AVX512F-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; AVX512F-NEXT: [[CONV3:%.*]] = zext i8 [[TMP3]] to i32
-; AVX512F-NEXT: [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV3]]
-; AVX512F-NEXT: [[ISPOS:%.*]] = icmp sgt i32 [[SUB]], -1
-; AVX512F-NEXT: [[NEG:%.*]] = sub nsw i32 0, [[SUB]]
-; AVX512F-NEXT: [[TMP4:%.*]] = select i1 [[ISPOS]], i32 [[SUB]], i32 [[NEG]]
-; AVX512F-NEXT: [[ADD]] = add nsw i32 [[TMP4]], [[S_015]]
-; AVX512F-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; AVX512F-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; AVX512F-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-;
+; CHECK-LABEL: goo
+; CHECK: LV(REG): VF = 8
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 7 registers
+; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
+; CHECK: LV(REG): VF = 16
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 13 registers
+; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
entry:
br label %for.body
}
define i64 @bar(i64* nocapture %a) {
-; CHECK-LABEL: @bar(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[TMP6]] to <2 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = add nsw <2 x i64> [[WIDE_LOAD]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i64> [[WIDE_LOAD3]], [[STEP_ADD]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[TMP8]], <2 x i64>* [[TMP10]], align 8
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP6]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* [[TMP11]], align 8
-; CHECK-NEXT: [[TMP12]] = add <2 x i64> [[TMP8]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP13]] = add <2 x i64> [[TMP9]], [[VEC_PHI2]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP13]], [[TMP12]]
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i64 [ [[ADD2:%.*]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[ADD2_LCSSA]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_012:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[S_011:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_012]]
-; CHECK-NEXT: [[TMP16:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP16]], [[I_012]]
-; CHECK-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[ADD2]] = add nsw i64 [[ADD]], [[S_011]]
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_012]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-;
-; AVX512F-LABEL: @bar(
-; AVX512F-NEXT: entry:
-; AVX512F-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; AVX512F: vector.ph:
-; AVX512F-NEXT: br label [[VECTOR_BODY:%.*]]
-; AVX512F: vector.body:
-; AVX512F-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT: [[VEC_PHI4:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT: [[VEC_PHI5:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT: [[VEC_PHI6:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT: [[STEP_ADD:%.*]] = add <8 x i64> [[VEC_IND]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
-; AVX512F-NEXT: [[STEP_ADD1:%.*]] = add <8 x i64> [[STEP_ADD]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
-; AVX512F-NEXT: [[STEP_ADD2:%.*]] = add <8 x i64> [[STEP_ADD1]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
-; AVX512F-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; AVX512F-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; AVX512F-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; AVX512F-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP0]]
-; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP1]]
-; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP2]]
-; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP3]]
-; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 0
-; AVX512F-NEXT: [[TMP9:%.*]] = bitcast i64* [[TMP8]] to <8 x i64>*
-; AVX512F-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i64>, <8 x i64>* [[TMP9]], align 8
-; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 8
-; AVX512F-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <8 x i64>*
-; AVX512F-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i64>, <8 x i64>* [[TMP11]], align 8
-; AVX512F-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 16
-; AVX512F-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <8 x i64>*
-; AVX512F-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i64>, <8 x i64>* [[TMP13]], align 8
-; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 24
-; AVX512F-NEXT: [[TMP15:%.*]] = bitcast i64* [[TMP14]] to <8 x i64>*
-; AVX512F-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x i64>, <8 x i64>* [[TMP15]], align 8
-; AVX512F-NEXT: [[TMP16:%.*]] = add nsw <8 x i64> [[WIDE_LOAD]], [[VEC_IND]]
-; AVX512F-NEXT: [[TMP17:%.*]] = add nsw <8 x i64> [[WIDE_LOAD7]], [[STEP_ADD]]
-; AVX512F-NEXT: [[TMP18:%.*]] = add nsw <8 x i64> [[WIDE_LOAD8]], [[STEP_ADD1]]
-; AVX512F-NEXT: [[TMP19:%.*]] = add nsw <8 x i64> [[WIDE_LOAD9]], [[STEP_ADD2]]
-; AVX512F-NEXT: [[TMP20:%.*]] = bitcast i64* [[TMP8]] to <8 x i64>*
-; AVX512F-NEXT: store <8 x i64> [[TMP16]], <8 x i64>* [[TMP20]], align 8
-; AVX512F-NEXT: [[TMP21:%.*]] = bitcast i64* [[TMP10]] to <8 x i64>*
-; AVX512F-NEXT: store <8 x i64> [[TMP17]], <8 x i64>* [[TMP21]], align 8
-; AVX512F-NEXT: [[TMP22:%.*]] = bitcast i64* [[TMP12]] to <8 x i64>*
-; AVX512F-NEXT: store <8 x i64> [[TMP18]], <8 x i64>* [[TMP22]], align 8
-; AVX512F-NEXT: [[TMP23:%.*]] = bitcast i64* [[TMP14]] to <8 x i64>*
-; AVX512F-NEXT: store <8 x i64> [[TMP19]], <8 x i64>* [[TMP23]], align 8
-; AVX512F-NEXT: [[TMP24]] = add <8 x i64> [[TMP16]], [[VEC_PHI]]
-; AVX512F-NEXT: [[TMP25]] = add <8 x i64> [[TMP17]], [[VEC_PHI4]]
-; AVX512F-NEXT: [[TMP26]] = add <8 x i64> [[TMP18]], [[VEC_PHI5]]
-; AVX512F-NEXT: [[TMP27]] = add <8 x i64> [[TMP19]], [[VEC_PHI6]]
-; AVX512F-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; AVX512F-NEXT: [[VEC_IND_NEXT]] = add <8 x i64> [[STEP_ADD2]], <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>
-; AVX512F-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; AVX512F-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; AVX512F: middle.block:
-; AVX512F-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP25]], [[TMP24]]
-; AVX512F-NEXT: [[BIN_RDX10:%.*]] = add <8 x i64> [[TMP26]], [[BIN_RDX]]
-; AVX512F-NEXT: [[BIN_RDX11:%.*]] = add <8 x i64> [[TMP27]], [[BIN_RDX10]]
-; AVX512F-NEXT: [[TMP29:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX11]])
-; AVX512F-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; AVX512F-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; AVX512F: scalar.ph:
-; AVX512F-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; AVX512F-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
-; AVX512F-NEXT: br label [[FOR_BODY:%.*]]
-; AVX512F: for.cond.cleanup:
-; AVX512F-NEXT: [[ADD2_LCSSA:%.*]] = phi i64 [ [[ADD2:%.*]], [[FOR_BODY]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ]
-; AVX512F-NEXT: ret i64 [[ADD2_LCSSA]]
-; AVX512F: for.body:
-; AVX512F-NEXT: [[I_012:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; AVX512F-NEXT: [[S_011:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD2]], [[FOR_BODY]] ]
-; AVX512F-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_012]]
-; AVX512F-NEXT: [[TMP30:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; AVX512F-NEXT: [[ADD:%.*]] = add nsw i64 [[TMP30]], [[I_012]]
-; AVX512F-NEXT: store i64 [[ADD]], i64* [[ARRAYIDX]], align 8
-; AVX512F-NEXT: [[ADD2]] = add nsw i64 [[ADD]], [[S_011]]
-; AVX512F-NEXT: [[INC]] = add nuw nsw i64 [[I_012]], 1
-; AVX512F-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INC]], 1024
-; AVX512F-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-;
+; CHECK-LABEL: bar
+; CHECK: LV(REG): VF = 2
+; CHECK-NEXT: LV(REG): Found max usage: 2 item
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 3 registers
+; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
+; CHECK-NEXT: LV(REG): Found invariant usage: 0 item
entry:
br label %for.body
; For c[i] = e[d[i]] in the loop, e[d[i]] is not consecutive, but its index %tmp can
; be gathered into a vector. For VF == 16, the vector version of %tmp will be
; <16 x i64>, so the maximum AVX512 vector register usage will be 2.
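; A minimal C-level sketch of that loop (the @c/@d/@e arrays and the 10000 trip
; count are taken from the CHECK lines; the element types are illustrative):
;   for (long i = 0; i < 10000; i++)
;     c[i] = e[d[i]];                /* e[d[i]] becomes a gather; d[i] widens to <16 x i64> */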
-; CHECK-LABEL: @hoo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
-; AVX512F-LABEL: @hoo(
-; AVX512F-NEXT: iter.check:
-; AVX512F-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; AVX512F: vector.main.loop.iter.check:
-; AVX512F-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; AVX512F: vector.ph:
-; AVX512F-NEXT: br label [[VECTOR_BODY:%.*]]
-; AVX512F: vector.body:
-; AVX512F-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; AVX512F-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; AVX512F-NEXT: [[TMP1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[TMP0]]
-; AVX512F-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[TMP1]], i32 0
-; AVX512F-NEXT: [[TMP3:%.*]] = bitcast i64* [[TMP2]] to <16 x i64>*
-; AVX512F-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i64>, <16 x i64>* [[TMP3]], align 8
-; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, <16 x i64> [[WIDE_LOAD]]
-; AVX512F-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP4]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> undef)
-; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[TMP0]]
-; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; AVX512F-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
-; AVX512F-NEXT: store <16 x i32> [[WIDE_MASKED_GATHER]], <16 x i32>* [[TMP7]], align 4
-; AVX512F-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; AVX512F-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
-; AVX512F-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; AVX512F: middle.block:
-; AVX512F-NEXT: [[CMP_N:%.*]] = icmp eq i64 10000, 10000
-; AVX512F-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; AVX512F: vec.epilog.iter.check:
-; AVX512F-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; AVX512F: vec.epilog.ph:
-; AVX512F-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; AVX512F-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; AVX512F: vec.epilog.vector.body:
-; AVX512F-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; AVX512F-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
-; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[TMP9]]
-; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, i64* [[TMP10]], i32 0
-; AVX512F-NEXT: [[TMP12:%.*]] = bitcast i64* [[TMP11]] to <8 x i64>*
-; AVX512F-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i64>, <8 x i64>* [[TMP12]], align 8
-; AVX512F-NEXT: [[TMP13:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, <8 x i64> [[WIDE_LOAD3]]
-; AVX512F-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP13]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
-; AVX512F-NEXT: [[TMP14:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[TMP9]]
-; AVX512F-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
-; AVX512F-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <8 x i32>*
-; AVX512F-NEXT: store <8 x i32> [[WIDE_MASKED_GATHER4]], <8 x i32>* [[TMP16]], align 4
-; AVX512F-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[OFFSET_IDX]], 8
-; AVX512F-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT5]], 10000
-; AVX512F-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; AVX512F: vec.epilog.middle.block:
-; AVX512F-NEXT: [[CMP_N1:%.*]] = icmp eq i64 10000, 10000
-; AVX512F-NEXT: br i1 [[CMP_N1]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; AVX512F: vec.epilog.scalar.ph:
-; AVX512F-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 10000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 10000, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; AVX512F-NEXT: br label [[FOR_BODY:%.*]]
-; AVX512F: for.body:
-; AVX512F-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; AVX512F-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* @d, i64 0, i64 [[INDVARS_IV]]
-; AVX512F-NEXT: [[TMP:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; AVX512F-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @e, i64 0, i64 [[TMP]]
-; AVX512F-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; AVX512F-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [0 x i32], [0 x i32]* @c, i64 0, i64 [[INDVARS_IV]]
-; AVX512F-NEXT: store i32 [[TMP1]], i32* [[ARRAYIDX3]], align 4
-; AVX512F-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; AVX512F-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 10000
-; AVX512F-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; AVX512F: for.end.loopexit:
-; AVX512F-NEXT: br label [[FOR_END]]
-; AVX512F: for.end:
-; AVX512F-NEXT: ret void
-;
+; AVX512F-LABEL: hoo
+; AVX512F: LV(REG): VF = 16
; AVX512F-NEXT:  LV(REG): Found max usage: 2 item
; AVX512F-NEXT:  LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
; AVX512F-NEXT:  LV(REG): RegisterClass: Generic::VectorRC, 2 registers
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -instcombine -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
define void @test1() {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ALLOCA:%.*]] = alloca float, align 4
-; CHECK-NEXT: br label [[LOOP_EXIT_DIM_11_CRITEDGE:%.*]]
-; CHECK: loop_exit.dim.11.critedge:
-; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint float* [[ALLOCA]] to i64
-; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 4
-; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-; CHECK-NEXT: br label [[LOOP_HEADER_DIM_017_PREHEADER:%.*]]
-; CHECK: loop_header.dim.017.preheader.loopexit:
-; CHECK-NEXT: br label [[LOOP_HEADER_DIM_017_PREHEADER]]
-; CHECK: loop_header.dim.017.preheader:
-; CHECK-NEXT: br label [[LOOP_BODY_DIM_018:%.*]]
-; CHECK: loop_body.dim.018:
-; CHECK-NEXT: [[INVAR_ADDRESS_DIM_019_0135:%.*]] = phi i64 [ 0, [[LOOP_HEADER_DIM_017_PREHEADER]] ], [ [[TMP0:%.*]], [[LOOP_BODY_DIM_018]] ]
-; CHECK-NEXT: call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: [[TMP0]] = add nuw nsw i64 [[INVAR_ADDRESS_DIM_019_0135]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[TMP0]], 256
-; CHECK-NEXT: br i1 [[TMP1]], label [[LOOP_HEADER_DIM_017_PREHEADER_LOOPEXIT:%.*]], label [[LOOP_BODY_DIM_018]]
-;
entry:
%alloca = alloca float, align 4
br label %loop_exit.dim.11.critedge
loop_body.dim.018: ; preds = %loop_body.dim.018, %loop_header.dim.017.preheader
%invar_address.dim.019.0135 = phi i64 [ 0, %loop_header.dim.017.preheader ], [ %0, %loop_body.dim.018 ]
call void @llvm.assume(i1 %maskcond)
+; CHECK: call void @llvm.assume(
+; CHECK-NOT: call void @llvm.assume(
%0 = add nuw nsw i64 %invar_address.dim.019.0135, 1
%1 = icmp eq i64 %0, 256
br i1 %1, label %loop_header.dim.017.preheader, label %loop_body.dim.018
; Function Attrs: norecurse nounwind ssp uwtable
define void @_Z3fn1v() #0 {
; CHECK-LABEL: @_Z3fn1v(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @c, align 4
-; CHECK-NEXT: [[CMP34:%.*]] = icmp sgt i32 [[TMP0]], 8
-; CHECK-NEXT: br i1 [[CMP34]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; CHECK: for.body.lr.ph:
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @a, align 4
-; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP1]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* @b, align 8
-; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[TMP2]], 4063299859190
-; CHECK-NEXT: [[TOBOOL6:%.*]] = icmp eq i64 [[MUL]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[TMP0]] to i64
-; CHECK-NEXT: br i1 [[TOBOOL]], label [[FOR_BODY_US_PREHEADER:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], -9
-; CHECK-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP4]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = add nuw i64 [[TMP5]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP6]], 16
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP6]], 16
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP6]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-NEXT: [[IND_END:%.*]] = add i64 8, [[TMP7]]
-; CHECK-NEXT: [[IND_END2:%.*]] = mul i64 [[N_VEC]], 2
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = sub nsw <16 x i64> <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>, [[VEC_IND]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP10:%.*]] = add nsw <16 x i64> [[TMP8]], [[VEC_IND3]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP9]], <16 x i64> [[TMP10]], i64 0
-; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP11]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i64> [[VEC_IND3]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP13:%.*]] = add nsw <16 x i64> [[TMP8]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP9]], <16 x i64> [[TMP13]], i64 0
-; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP14]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, %vector.ph ], [ [[VEC_IND_NEXT4:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i64> <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>, [[VEC_IND]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP12]], i64 0
+; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i64> [[VEC_IND3]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP15]], i64 0
+; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP6]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT99:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 8, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body.us.preheader:
-; CHECK-NEXT: [[TMP16:%.*]] = add nsw i64 [[TMP3]], -9
-; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP16]], 1
-; CHECK-NEXT: [[TMP18:%.*]] = add nuw i64 [[TMP17]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK8:%.*]] = icmp ult i64 [[TMP18]], 16
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK8]], label [[SCALAR_PH6:%.*]], label [[VECTOR_PH9:%.*]]
-; CHECK: vector.ph9:
-; CHECK-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[TMP18]], 16
-; CHECK-NEXT: [[N_VEC11:%.*]] = sub i64 [[TMP18]], [[N_MOD_VF10]]
-; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[N_VEC11]], 2
-; CHECK-NEXT: [[IND_END13:%.*]] = add i64 8, [[TMP19]]
-; CHECK-NEXT: [[IND_END15:%.*]] = mul i64 [[N_VEC11]], 2
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i1> poison, i1 [[TOBOOL6]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i1> [[BROADCAST_SPLATINSERT]], <16 x i1> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY7:%.*]]
-; CHECK: vector.body7:
-; CHECK-NEXT: [[INDEX17:%.*]] = phi i64 [ 0, [[VECTOR_PH9]] ], [ [[INDEX_NEXT22:%.*]], [[VECTOR_BODY7]] ]
-; CHECK-NEXT: [[VEC_IND18:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, [[VECTOR_PH9]] ], [ [[VEC_IND_NEXT19:%.*]], [[VECTOR_BODY7]] ]
-; CHECK-NEXT: [[VEC_IND20:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, [[VECTOR_PH9]] ], [ [[VEC_IND_NEXT21:%.*]], [[VECTOR_BODY7]] ]
-; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <16 x i64> <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>, [[VEC_IND18]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND18]]
-; CHECK-NEXT: [[TMP22:%.*]] = add nsw <16 x i64> [[TMP20]], [[VEC_IND20]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP21]], <16 x i64> [[TMP22]], i64 0
-; CHECK-NEXT: [[TMP24:%.*]] = xor <16 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP23]], i32 16, <16 x i1> [[TMP24]])
-; CHECK-NEXT: [[TMP25:%.*]] = or <16 x i64> [[VEC_IND20]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP26:%.*]] = add nsw <16 x i64> [[TMP20]], [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP21]], <16 x i64> [[TMP26]], i64 0
-; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP27]], i32 8, <16 x i1> [[TMP24]])
-; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>, <16 x i32*> [[TMP23]], i32 16, <16 x i1> [[BROADCAST_SPLAT]])
-; CHECK-NEXT: [[TMP28:%.*]] = or <16 x i64> [[VEC_IND20]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP29:%.*]] = add nsw <16 x i64> [[TMP20]], [[TMP28]]
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP21]], <16 x i64> [[TMP29]], i64 0
-; CHECK-NEXT: call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>, <16 x i32*> [[TMP30]], i32 8, <16 x i1> [[BROADCAST_SPLAT]])
-; CHECK-NEXT: [[INDEX_NEXT22]] = add nuw i64 [[INDEX17]], 16
-; CHECK-NEXT: [[VEC_IND_NEXT19]] = add <16 x i64> [[VEC_IND18]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-; CHECK-NEXT: [[VEC_IND_NEXT21]] = add <16 x i64> [[VEC_IND20]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT22]], [[N_VEC11]]
-; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK5:%.*]], label [[VECTOR_BODY7]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: middle.block5:
-; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i64 [[TMP18]], [[N_VEC11]]
-; CHECK-NEXT: br i1 [[CMP_N16]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH6]]
-; CHECK: scalar.ph6:
-; CHECK-NEXT: [[BC_RESUME_VAL12:%.*]] = phi i64 [ [[IND_END13]], [[MIDDLE_BLOCK5]] ], [ 8, [[FOR_BODY_US_PREHEADER]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL14:%.*]] = phi i64 [ [[IND_END15]], [[MIDDLE_BLOCK5]] ], [ 0, [[FOR_BODY_US_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY_US:%.*]]
-; CHECK: for.body.us:
-; CHECK-NEXT: [[INDVARS_IV78:%.*]] = phi i64 [ [[INDVARS_IV_NEXT79:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US:%.*]] ], [ [[BC_RESUME_VAL12]], [[SCALAR_PH6]] ]
-; CHECK-NEXT: [[INDVARS_IV70:%.*]] = phi i64 [ [[INDVARS_IV_NEXT71:%.*]], [[FOR_COND_CLEANUP4_US_LCSSA_US_US]] ], [ [[BC_RESUME_VAL14]], [[SCALAR_PH6]] ]
-; CHECK-NEXT: [[TMP32:%.*]] = sub nsw i64 8, [[INDVARS_IV78]]
-; CHECK-NEXT: [[ADD_PTR_US:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[INDVARS_IV78]]
-; CHECK-NEXT: [[TMP33:%.*]] = add nsw i64 [[TMP32]], [[INDVARS_IV70]]
-; CHECK-NEXT: [[ARRAYDECAY_US_US_US:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ADD_PTR_US]], i64 [[TMP33]], i64 0
-; CHECK-NEXT: br i1 [[TOBOOL6]], label [[FOR_BODY5_US_US_US_PREHEADER:%.*]], label [[FOR_BODY5_US_US48_PREHEADER:%.*]]
-; CHECK: for.body5.us.us48.preheader:
-; CHECK-NEXT: store i32 8, i32* [[ARRAYDECAY_US_US_US]], align 16
-; CHECK-NEXT: [[INDVARS_IV_NEXT66:%.*]] = or i64 [[INDVARS_IV70]], 1
-; CHECK-NEXT: [[TMP34:%.*]] = add nsw i64 [[TMP32]], [[INDVARS_IV_NEXT66]]
-; CHECK-NEXT: [[ARRAYDECAY_US_US55_1:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ADD_PTR_US]], i64 [[TMP34]], i64 0
-; CHECK-NEXT: store i32 8, i32* [[ARRAYDECAY_US_US55_1]], align 8
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP4_US_LCSSA_US_US]]
-; CHECK: for.body5.us.us.us.preheader:
-; CHECK-NEXT: store i32 7, i32* [[ARRAYDECAY_US_US_US]], align 16
-; CHECK-NEXT: [[INDVARS_IV_NEXT73:%.*]] = or i64 [[INDVARS_IV70]], 1
-; CHECK-NEXT: [[TMP35:%.*]] = add nsw i64 [[TMP32]], [[INDVARS_IV_NEXT73]]
-; CHECK-NEXT: [[ARRAYDECAY_US_US_US_1:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ADD_PTR_US]], i64 [[TMP35]], i64 0
-; CHECK-NEXT: store i32 7, i32* [[ARRAYDECAY_US_US_US_1]], align 8
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP4_US_LCSSA_US_US]]
-; CHECK: for.cond.cleanup4.us-lcssa.us.us:
-; CHECK-NEXT: [[INDVARS_IV_NEXT79]] = add nuw nsw i64 [[INDVARS_IV78]], 2
-; CHECK-NEXT: [[CMP_US:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT79]], [[TMP3]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT71]] = add nuw nsw i64 [[INDVARS_IV70]], 2
-; CHECK-NEXT: br i1 [[CMP_US]], label [[FOR_BODY_US]], label [[FOR_COND_CLEANUP_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup.loopexit99:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV95:%.*]] = phi i64 [ [[INDVARS_IV_NEXT96:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INDVARS_IV87:%.*]] = phi i64 [ [[INDVARS_IV_NEXT88:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP36:%.*]] = sub nsw i64 8, [[INDVARS_IV95]]
-; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[INDVARS_IV95]]
-; CHECK-NEXT: [[TMP37:%.*]] = add nsw i64 [[TMP36]], [[INDVARS_IV87]]
-; CHECK-NEXT: [[ARRAYDECAY_US31:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ADD_PTR]], i64 [[TMP37]], i64 0
-; CHECK-NEXT: store i32 8, i32* [[ARRAYDECAY_US31]], align 16
-; CHECK-NEXT: [[INDVARS_IV_NEXT90:%.*]] = or i64 [[INDVARS_IV87]], 1
-; CHECK-NEXT: [[TMP38:%.*]] = add nsw i64 [[TMP36]], [[INDVARS_IV_NEXT90]]
-; CHECK-NEXT: [[ARRAYDECAY_US31_1:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[ADD_PTR]], i64 [[TMP38]], i64 0
-; CHECK-NEXT: store i32 8, i32* [[ARRAYDECAY_US31_1]], align 8
-; CHECK-NEXT: [[INDVARS_IV_NEXT96]] = add nuw nsw i64 [[INDVARS_IV95]], 2
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT96]], [[TMP3]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT88]] = add nuw nsw i64 [[INDVARS_IV87]], 2
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT99]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
;
entry:
%0 = load i32, i32* @c, align 4
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -mcpu=slm -debug 2>&1 | FileCheck -check-prefix=MSG %s
; REQUIRES: asserts
; This test should not be vectorized on the X86 SLM (Silvermont) arch.
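;
; Reconstructed from the scalar IR of @no_vec below, the kernel is roughly the
; following C (a hypothetical sketch, for illustration only):
;
;   long acc = 0;
;   for (int i = 0; i < LastIndex; i++)
;     acc += ((long)InputData[i] * (long)InputData[i + lag]) >> Scale;
;   return (int)acc;
;
; The cost model on SLM is expected to keep this loop scalar (the 64-bit
; multiply in the reduction is the likely culprit); the MSG check below
; verifies that VF = 1 is selected.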
target triple = "x86_64-unknown-linux-gnu"
define i32 @no_vec(i32 %LastIndex, i16* nocapture readonly %InputData, i16 signext %lag, i16 signext %Scale) {
-; MSG-LABEL: @no_vec(
-; MSG-NEXT: entry:
-; MSG-NEXT: [[CMP17:%.*]] = icmp sgt i32 [[LASTINDEX:%.*]], 0
-; MSG-NEXT: br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
-; MSG: for.body.lr.ph:
-; MSG-NEXT: [[CONV5:%.*]] = sext i16 [[SCALE:%.*]] to i64
-; MSG-NEXT: [[SH_PROM:%.*]] = and i64 [[CONV5]], 4294967295
-; MSG-NEXT: [[TMP0:%.*]] = sext i16 [[LAG:%.*]] to i64
-; MSG-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LASTINDEX]] to i64
-; MSG-NEXT: br label [[FOR_BODY:%.*]]
-; MSG: for.cond.cleanup.loopexit:
-; MSG-NEXT: [[ADD7_LCSSA:%.*]] = phi i64 [ [[ADD7:%.*]], [[FOR_BODY]] ]
-; MSG-NEXT: [[CONV8:%.*]] = trunc i64 [[ADD7_LCSSA]] to i32
-; MSG-NEXT: br label [[FOR_COND_CLEANUP]]
-; MSG: for.cond.cleanup:
-; MSG-NEXT: [[ACCUMULATOR_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[CONV8]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
-; MSG-NEXT: ret i32 [[ACCUMULATOR_0_LCSSA]]
-; MSG: for.body:
-; MSG-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; MSG-NEXT: [[ACCUMULATOR_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ]
-; MSG-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[INPUTDATA:%.*]], i64 [[INDVARS_IV]]
-; MSG-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX]], align 2
-; MSG-NEXT: [[CONV:%.*]] = sext i16 [[TMP1]] to i64
-; MSG-NEXT: [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], [[TMP0]]
-; MSG-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, i16* [[INPUTDATA]], i64 [[TMP2]]
-; MSG-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX3]], align 2
-; MSG-NEXT: [[CONV4:%.*]] = sext i16 [[TMP3]] to i64
-; MSG-NEXT: [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV]]
-; MSG-NEXT: [[SHR:%.*]] = ashr i64 [[MUL]], [[SH_PROM]]
-; MSG-NEXT: [[ADD7]] = add i64 [[SHR]], [[ACCUMULATOR_018]]
-; MSG-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; MSG-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; MSG-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
-;
entry:
+; MSG: LV: Selecting VF: 1.
%cmp17 = icmp sgt i32 %LastIndex, 0
br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
%conv = sext i16 %1 to i64
%2 = add nsw i64 %indvars.iv, %0
%arrayidx3 = getelementptr inbounds i16, i16* %InputData, i64 %2
- %3 = load i16, i16* %arrayidx3, align 2
+ %3 = load i16, i16* %arrayidx3, align 2
%conv4 = sext i16 %3 to i64
%mul = mul nsw i64 %conv4, %conv
%shr = ashr i64 %mul, %sh_prom
- %add7 = add i64 %shr, %Accumulator.018
+ %add7 = add i64 %shr, %Accumulator.018
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -vector-library=SVML -inject-tli-mappings -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -mattr=avx -S < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define void @sin_f64(double* nocapture %varray) {
; CHECK-LABEL: @sin_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @sin(double [[CONV]]) #[[ATTR3:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @sin_f32(float* nocapture %varray) {
; CHECK-LABEL: @sin_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @sinf(float [[CONV]]) #[[ATTR4:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @sin_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @sin_f64_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.sin.f64(double [[CONV]]) #[[ATTR5:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sin4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @sin_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @sin_f32_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @llvm.sin.f32(float [[CONV]]) #[[ATTR6:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sinf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @cos_f64(double* nocapture %varray) {
; CHECK-LABEL: @cos_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @cos(double [[CONV]]) #[[ATTR7:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @cos_f32(float* nocapture %varray) {
; CHECK-LABEL: @cos_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @cosf(float [[CONV]]) #[[ATTR8:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @cos_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @cos_f64_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.cos.f64(double [[CONV]]) #[[ATTR9:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_cos4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @cos_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @cos_f32_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @llvm.cos.f32(float [[CONV]]) #[[ATTR10:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_cosf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @pow_f64(double* nocapture %varray, double* nocapture readonly %exp) {
; CHECK-LABEL: @pow_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VARRAY1:%.*]] = bitcast double* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT: [[EXP3:%.*]] = bitcast double* [[EXP:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[VARRAY]], i64 1000
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[EXP]], i64 1000
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP4]], align 4, !alias.scope !18
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP1]], <4 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[TMP8]], align 4, !alias.scope !21, !noalias !18
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[EXP]], i64 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = tail call double @pow(double [[CONV]], double [[TMP1]]) #[[ATTR11:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[TMP2]], double* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @pow_f64_intrinsic(double* nocapture %varray, double* nocapture readonly %exp) {
; CHECK-LABEL: @pow_f64_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VARRAY1:%.*]] = bitcast double* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT: [[EXP3:%.*]] = bitcast double* [[EXP:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[VARRAY]], i64 1000
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[EXP]], i64 1000
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <4 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP4]], align 4, !alias.scope !25
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP1]], <4 x double> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP7]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP5]], <4 x double>* [[TMP8]], align 4, !alias.scope !28, !noalias !25
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[EXP]], i64 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = tail call double @llvm.pow.f64(double [[CONV]], double [[TMP1]]) #[[ATTR12:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[TMP2]], double* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @pow_f32(float* nocapture %varray, float* nocapture readonly %exp) {
; CHECK-LABEL: @pow_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VARRAY1:%.*]] = bitcast float* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT: [[EXP3:%.*]] = bitcast float* [[EXP:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[VARRAY]], i64 1000
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[EXP]], i64 1000
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4, !alias.scope !32
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP1]], <4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP8]], align 4, !alias.scope !35, !noalias !32
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = tail call float @powf(float [[CONV]], float [[TMP1]]) #[[ATTR13:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @pow_f32_intrinsic(float* nocapture %varray, float* nocapture readonly %exp) {
; CHECK-LABEL: @pow_f32_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VARRAY1:%.*]] = bitcast float* [[VARRAY:%.*]] to i8*
-; CHECK-NEXT: [[EXP3:%.*]] = bitcast float* [[EXP:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[VARRAY]], i64 1000
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[EXP]], i64 1000
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[VARRAY1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[EXP3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4, !alias.scope !39
-; CHECK-NEXT: [[TMP5:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP1]], <4 x float> [[WIDE_LOAD]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP8]], align 4, !alias.scope !42, !noalias !39
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[EXP]], i64 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = tail call float @llvm.pow.f32(float [[CONV]], float [[TMP1]]) #[[ATTR14:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[TMP2]], float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP45:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP8:%.*]] = call <4 x float> @__svml_powf4(<4 x float> [[TMP4:%.*]], <4 x float> [[WIDE_LOAD:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @exp_f64(double* nocapture %varray) {
; CHECK-LABEL: @exp_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP46:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @exp(double [[CONV]]) #[[ATTR15:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @exp_f32(float* nocapture %varray) {
; CHECK-LABEL: @exp_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @expf(float [[CONV]]) #[[ATTR16:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @exp_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @exp_f64_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.exp.f64(double [[CONV]]) #[[ATTR17:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP51:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @exp_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @exp_f32_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @llvm.exp.f32(float [[CONV]]) #[[ATTR18:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP53:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_expf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @log_f64(double* nocapture %varray) {
; CHECK-LABEL: @log_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP54:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @log(double [[CONV]]) #[[ATTR19:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP55:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @log_f32(float* nocapture %varray) {
; CHECK-LABEL: @log_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP56:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @logf(float [[CONV]]) #[[ATTR20:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP57:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @log_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @log_f64_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.log.f64(double [[CONV]]) #[[ATTR21:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP59:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @log_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @log_f32_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP60:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @llvm.log.f32(float [[CONV]]) #[[ATTR22:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP61:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_logf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @log2_f64(double* nocapture %varray) {
; CHECK-LABEL: @log2_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP62:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @log2(double [[CONV]]) #[[ATTR23:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP63:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @log2_f32(float* nocapture %varray) {
; CHECK-LABEL: @log2_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP64:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @log2f(float [[CONV]]) #[[ATTR24:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP65:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @log2_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @log2_f64_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP66:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.log2.f64(double [[CONV]]) #[[ATTR25:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP67:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log24(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @log2_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @log2_f32_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP68:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @llvm.log2.f32(float [[CONV]]) #[[ATTR26:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP69:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @log10_f64(double* nocapture %varray) {
; CHECK-LABEL: @log10_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP70:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @log10(double [[CONV]]) #[[ATTR27:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP71:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @log10_f32(float* nocapture %varray) {
; CHECK-LABEL: @log10_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP72:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @log10f(float [[CONV]]) #[[ATTR28:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP73:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @log10_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @log10_f64_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP74:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.log10.f64(double [[CONV]]) #[[ATTR29:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP75:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_log104(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @log10_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @log10_f32_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP76:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @llvm.log10.f32(float [[CONV]]) #[[ATTR30:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP77:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_log10f4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @sqrt_f64(double* nocapture %varray) {
; CHECK-LABEL: @sqrt_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_sqrt4(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP78:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @sqrt(double [[CONV]]) #[[ATTR31:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP79:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_sqrt4(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @sqrt_f32(float* nocapture %varray) {
; CHECK-LABEL: @sqrt_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_sqrtf4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP80:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @sqrtf(float [[CONV]]) #[[ATTR32:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP81:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_sqrtf4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @exp2_f64(double* nocapture %varray) {
; CHECK-LABEL: @exp2_f64(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP82:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @exp2(double [[CONV]]) #[[ATTR33:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP83:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @exp2_f32(float* nocapture %varray) {
; CHECK-LABEL: @exp2_f32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP84:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @exp2f(float [[CONV]]) #[[ATTR34:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP85:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @exp2_f64_intrinsic(double* nocapture %varray) {
; CHECK-LABEL: @exp2_f64_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x double>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP4]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP2]], <4 x double>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP86:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to double
-; CHECK-NEXT: [[CALL:%.*]] = tail call double @llvm.exp2.f64(double [[CONV]]) #[[ATTR35:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store double [[CALL]], double* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP87:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_exp24(<4 x double> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
define void @exp2_f32_intrinsic(float* nocapture %varray) {
; CHECK-LABEL: @exp2_f32_intrinsic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[VEC_IND]] to <4 x float>
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP1]])
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[VARRAY:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP88:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP]] to float
-; CHECK-NEXT: [[CALL:%.*]] = tail call float @llvm.exp2.f32(float [[CONV]]) #[[ATTR36:[0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[VARRAY]], i64 [[IV]]
-; CHECK-NEXT: store float [[CALL]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP89:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
+; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_exp2f4(<4 x float> [[TMP4:%.*]])
+; CHECK: ret void
;
entry:
br label %for.body
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP1]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP2:!llvm.loop !.*]]
;
entry:
br label %for.body
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP1]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP5:!llvm.loop !.*]]
;
entry:
br label %for.body
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT1]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <8 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <8 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
; CHECK-NEXT: [[TMP13:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP12]], <8 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP13]])
; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[SUM_1]] = add nuw nsw i32 [[ADD]], [[SUM_0]]
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ [[SUM_1]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[SUM_1_LCSSA]]
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
; REQUIRES: asserts
; CHECK: cost of 5 for VF 2 For instruction: %conv = uitofp i64 %tmp to double
; CHECK: cost of 10 for VF 4 For instruction: %conv = uitofp i64 %tmp to double
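; Check that the X86 cost model reports an estimated cost of 5 for the uitofp
; at VF 2 and 10 at VF 4 on corei7-avx, as required by the two checks above.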
define void @uint64_to_double_cost(i64* noalias nocapture %a, double* noalias nocapture readonly %b) nounwind {
-; CHECK-LABEL: @uint64_to_double_cost(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64* [[TMP8]] to <4 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, <4 x i64>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 4
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <4 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, <4 x i64>* [[TMP11]], align 4
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 8
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i64* [[TMP12]] to <4 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, <4 x i64>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, i64* [[TMP4]], i32 12
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i64* [[TMP14]] to <4 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i64>, <4 x i64>* [[TMP15]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = uitofp <4 x i64> [[WIDE_LOAD]] to <4 x double>
-; CHECK-NEXT: [[TMP17:%.*]] = uitofp <4 x i64> [[WIDE_LOAD1]] to <4 x double>
-; CHECK-NEXT: [[TMP18:%.*]] = uitofp <4 x i64> [[WIDE_LOAD2]] to <4 x double>
-; CHECK-NEXT: [[TMP19:%.*]] = uitofp <4 x i64> [[WIDE_LOAD3]] to <4 x double>
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast double* [[TMP24]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP16]], <4 x double>* [[TMP25]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 4
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast double* [[TMP26]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP17]], <4 x double>* [[TMP27]], align 4
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 8
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP18]], <4 x double>* [[TMP29]], align 4
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[TMP20]], i32 12
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP19]], <4 x double>* [[TMP31]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
-; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 256, 256
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 256, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP:%.*]] = load i64, i64* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CONV:%.*]] = uitofp i64 [[TMP]] to double
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store double [[CONV]], double* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body
for.body:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ADDR:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ADDR:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ADDR]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: loopexit:
-; CHECK-NEXT: [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[LOAD_LCSSA]]
;
entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ADDR:%.*]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ADDR:%.*]], align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ADDR]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ADDR]], align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP5]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
-; CHECK-NEXT: [[TMP6]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT: [[TMP7]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT: [[TMP8]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
+; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
+; CHECK-NEXT: [[TMP11]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP6]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP7]], [[BIN_RDX10]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP11]], [[BIN_RDX10]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: loopexit:
-; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = udiv i32 [[BYTE_OFFSET:%.*]], 4
-; CHECK-NEXT: [[TMP1:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP6]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = udiv i32 [[BYTE_OFFSET:%.*]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
+; CHECK-NEXT: [[TMP7:%.*]] = udiv i32 [[BYTE_OFFSET]], 4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP6]]
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[ADDR]], i32 [[TMP7]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP9]], align 4
+; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP11]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
; CHECK: loopexit:
-; CHECK-NEXT: [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[LOAD_LCSSA:%.*]] = phi i32 [ [[LOAD]], [[FOR_BODY]] ], [ [[TMP15]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[LOAD_LCSSA]]
;
entry:
; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4
; CHECK-NEXT: store i32 0, i32* [[ADDR]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[STEP_ADD1:%.*]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[STEP_ADD2:%.*]] = add <4 x i32> [[STEP_ADD1]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 0
-; CHECK-NEXT: store i32 [[TMP0]], i32* [[ADDR:%.*]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 1
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 2
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[VEC_IND]], i32 3
-; CHECK-NEXT: store i32 [[TMP3]], i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 0
-; CHECK-NEXT: store i32 [[TMP4]], i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 1
+; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD5:%.*]] = add <4 x i32> [[VEC_IND4]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[STEP_ADD6:%.*]] = add <4 x i32> [[STEP_ADD5]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[STEP_ADD7:%.*]] = add <4 x i32> [[STEP_ADD6]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[VEC_IND4]], i32 0
+; CHECK-NEXT: store i32 [[TMP4]], i32* [[ADDR:%.*]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[VEC_IND4]], i32 1
; CHECK-NEXT: store i32 [[TMP5]], i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[VEC_IND4]], i32 2
; CHECK-NEXT: store i32 [[TMP6]], i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[STEP_ADD]], i32 3
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[VEC_IND4]], i32 3
; CHECK-NEXT: store i32 [[TMP7]], i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[STEP_ADD1]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[STEP_ADD5]], i32 0
; CHECK-NEXT: store i32 [[TMP8]], i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[STEP_ADD1]], i32 1
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[STEP_ADD5]], i32 1
; CHECK-NEXT: store i32 [[TMP9]], i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[STEP_ADD1]], i32 2
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[STEP_ADD5]], i32 2
; CHECK-NEXT: store i32 [[TMP10]], i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[STEP_ADD1]], i32 3
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[STEP_ADD5]], i32 3
; CHECK-NEXT: store i32 [[TMP11]], i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[STEP_ADD2]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[STEP_ADD6]], i32 0
; CHECK-NEXT: store i32 [[TMP12]], i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[STEP_ADD2]], i32 1
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[STEP_ADD6]], i32 1
; CHECK-NEXT: store i32 [[TMP13]], i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[STEP_ADD2]], i32 2
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[STEP_ADD6]], i32 2
; CHECK-NEXT: store i32 [[TMP14]], i32* [[ADDR]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[STEP_ADD2]], i32 3
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[STEP_ADD6]], i32 3
; CHECK-NEXT: store i32 [[TMP15]], i32* [[ADDR]], align 4
+; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[STEP_ADD7]], i32 0
+; CHECK-NEXT: store i32 [[TMP16]], i32* [[ADDR]], align 4
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x i32> [[STEP_ADD7]], i32 1
+; CHECK-NEXT: store i32 [[TMP17]], i32* [[ADDR]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[STEP_ADD7]], i32 2
+; CHECK-NEXT: store i32 [[TMP18]], i32* [[ADDR]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i32> [[STEP_ADD7]], i32 3
+; CHECK-NEXT: store i32 [[TMP19]], i32* [[ADDR]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD2]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[VEC_IND_NEXT9]] = add <4 x i32> [[STEP_ADD7]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
-; CHECK-NEXT: store i32 [[TMP0]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP0]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP0]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP0]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP1]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP3]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP3]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP3]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
-; CHECK-NEXT: store i32 [[TMP3]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[A]], align 4, !alias.scope !12
+; CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP4]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP5]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP5]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP5]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP5]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP6]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP6]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP6]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP6]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP7]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP7]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP7]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
+; CHECK-NEXT: store i32 [[TMP7]], i32* [[B]], align 4, !alias.scope !15, !noalias !12
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @GAddr, align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* @GAddr, align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* @GAddr, align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* @GAddr, align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* @GAddr, align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP5]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
-; CHECK-NEXT: [[TMP6]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT: [[TMP7]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT: [[TMP8]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
+; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
+; CHECK-NEXT: [[TMP11]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP6]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP7]], [[BIN_RDX10]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP11]], [[BIN_RDX10]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK: loopexit:
-; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <4 x i32> poison, i32 [[TMP5]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT4]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT6]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* getelementptr (i32, i32* @GAddr, i64 5), align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT8]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP5]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
-; CHECK-NEXT: [[TMP6]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
-; CHECK-NEXT: [[TMP7]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
+; CHECK-NEXT: [[TMP8]] = add <4 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9]] = add <4 x i32> [[VEC_PHI1]], [[BROADCAST_SPLAT5]]
+; CHECK-NEXT: [[TMP10]] = add <4 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT7]]
+; CHECK-NEXT: [[TMP11]] = add <4 x i32> [[VEC_PHI3]], [[BROADCAST_SPLAT9]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP5]], [[TMP4]]
-; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP6]], [[BIN_RDX]]
-; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP7]], [[BIN_RDX10]]
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP9]], [[TMP8]]
+; CHECK-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP10]], [[BIN_RDX]]
+; CHECK-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP11]], [[BIN_RDX10]]
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4097, 4096
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4096, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 4096
; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK: loopexit:
-; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ACCUM_NEXT_LCSSA:%.*]] = phi i32 [ [[ACCUM_NEXT]], [[FOR_BODY]] ], [ [[TMP13]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[ACCUM_NEXT_LCSSA]]
;
entry:
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=x86_64-apple-darwin -mattr=+sse2 -loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
; REQUIRES: asserts
; CHECK: 'foo'
; CHECK: LV: Found an estimated cost of 1 for VF 4 For instruction: %shift = ashr i32 %val, %k
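; The check above pins the cost-model debug output: the ashr whose shift amount
; %k is loop-invariant is given an estimated cost of 1 at VF 4 on SSE2,
; presumably because a uniform shift amount keeps the vector shift cheap.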
define void @foo(i32* nocapture %p, i32 %k) local_unnamed_addr #0 {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[K:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[BODY:%.*]]
-; CHECK: body:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXT:%.*]], [[BODY]] ]
-; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 [[I]]
-; CHECK-NEXT: [[VAL:%.*]] = load i32, i32* [[PTR]], align 4
-; CHECK-NEXT: [[SHIFT:%.*]] = ashr i32 [[VAL]], [[K]]
-; CHECK-NEXT: store i32 [[SHIFT]], i32* [[PTR]], align 4
-; CHECK-NEXT: [[NEXT]] = add nuw nsw i64 [[I]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[NEXT]], 16
-; CHECK-NEXT: br i1 [[CMP]], label [[EXIT]], label [[BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
entry:
br label %body
; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP1:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !0
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> poison, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[VEC_IV]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19, i64 19>
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <8 x float>*
; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP8]], <8 x float>* [[TMP9]], i32 4, <8 x i1> [[TMP1]]), !llvm.access.group !6
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 24
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP7:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
; CHECK-NEXT: store <8 x float> [[TMP7]], <8 x float>* [[TMP8]], align 4, !llvm.access.group !6
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX2]], align 4, !llvm.access.group !6
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=corei7-avx -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-AVX1
; RUN: opt -loop-vectorize -vectorizer-maximize-bandwidth -mcpu=core-avx2 -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s --check-prefix=CHECK-AVX2
; REQUIRES: asserts
; CHECK-AVX1: LV: Selecting VF: 16.
; CHECK-AVX2: LV: Selecting VF: 32.
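; With -vectorizer-maximize-bandwidth the VF is expected to be driven by the
; smallest element type rather than the widest, so the mixed i8/i32 loop below
; should be vectorized at VF 16 on AVX1 and VF 32 on AVX2, matching the
; debug-output checks above.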
define void @foo() {
-; CHECK-AVX1-LABEL: @foo(
-; CHECK-AVX1-NEXT: iter.check:
-; CHECK-AVX1-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-AVX1: vector.main.loop.iter.check:
-; CHECK-AVX1-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-AVX1: vector.ph:
-; CHECK-AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-AVX1: vector.body:
-; CHECK-AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 [[TMP0]]
-; CHECK-AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-AVX1-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 1
-; CHECK-AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 [[TMP0]]
-; CHECK-AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; CHECK-AVX1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
-; CHECK-AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 1
-; CHECK-AVX1-NEXT: [[TMP7:%.*]] = add <16 x i8> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 [[TMP0]]
-; CHECK-AVX1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 0
-; CHECK-AVX1-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
-; CHECK-AVX1-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[TMP10]], align 1
-; CHECK-AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 [[TMP0]]
-; CHECK-AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>*
-; CHECK-AVX1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i32>, <16 x i32>* [[TMP13]], align 4
-; CHECK-AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 [[TMP0]]
-; CHECK-AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
-; CHECK-AVX1-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <16 x i32>*
-; CHECK-AVX1-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i32>, <16 x i32>* [[TMP16]], align 4
-; CHECK-AVX1-NEXT: [[TMP17:%.*]] = add nsw <16 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD2]]
-; CHECK-AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 [[TMP0]]
-; CHECK-AVX1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-AVX1-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <16 x i32>*
-; CHECK-AVX1-NEXT: store <16 x i32> [[TMP17]], <16 x i32>* [[TMP20]], align 4
-; CHECK-AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-AVX1-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992
-; CHECK-AVX1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-AVX1: middle.block:
-; CHECK-AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 992
-; CHECK-AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-AVX1: vec.epilog.iter.check:
-; CHECK-AVX1-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK-AVX1: vec.epilog.ph:
-; CHECK-AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-AVX1-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK-AVX1: vec.epilog.vector.body:
-; CHECK-AVX1-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-AVX1-NEXT: [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 [[TMP22]]
-; CHECK-AVX1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[TMP23]], i32 0
-; CHECK-AVX1-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to <8 x i8>*
-; CHECK-AVX1-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i8>, <8 x i8>* [[TMP25]], align 1
-; CHECK-AVX1-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 [[TMP22]]
-; CHECK-AVX1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, i8* [[TMP26]], i32 0
-; CHECK-AVX1-NEXT: [[TMP28:%.*]] = bitcast i8* [[TMP27]] to <8 x i8>*
-; CHECK-AVX1-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i8>, <8 x i8>* [[TMP28]], align 1
-; CHECK-AVX1-NEXT: [[TMP29:%.*]] = add <8 x i8> [[WIDE_LOAD7]], [[WIDE_LOAD6]]
-; CHECK-AVX1-NEXT: [[TMP30:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 [[TMP22]]
-; CHECK-AVX1-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP30]], i32 0
-; CHECK-AVX1-NEXT: [[TMP32:%.*]] = bitcast i8* [[TMP31]] to <8 x i8>*
-; CHECK-AVX1-NEXT: store <8 x i8> [[TMP29]], <8 x i8>* [[TMP32]], align 1
-; CHECK-AVX1-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 [[TMP22]]
-; CHECK-AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP33]], i32 0
-; CHECK-AVX1-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <8 x i32>*
-; CHECK-AVX1-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i32>, <8 x i32>* [[TMP35]], align 4
-; CHECK-AVX1-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 [[TMP22]]
-; CHECK-AVX1-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 0
-; CHECK-AVX1-NEXT: [[TMP38:%.*]] = bitcast i32* [[TMP37]] to <8 x i32>*
-; CHECK-AVX1-NEXT: [[WIDE_LOAD9:%.*]] = load <8 x i32>, <8 x i32>* [[TMP38]], align 4
-; CHECK-AVX1-NEXT: [[TMP39:%.*]] = add nsw <8 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD8]]
-; CHECK-AVX1-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 [[TMP22]]
-; CHECK-AVX1-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[TMP40]], i32 0
-; CHECK-AVX1-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <8 x i32>*
-; CHECK-AVX1-NEXT: store <8 x i32> [[TMP39]], <8 x i32>* [[TMP42]], align 4
-; CHECK-AVX1-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-AVX1-NEXT: [[TMP43:%.*]] = icmp eq i64 [[INDEX_NEXT10]], 1000
-; CHECK-AVX1-NEXT: br i1 [[TMP43]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-AVX1: vec.epilog.middle.block:
-; CHECK-AVX1-NEXT: [[CMP_N4:%.*]] = icmp eq i64 1000, 1000
-; CHECK-AVX1-NEXT: br i1 [[CMP_N4]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK-AVX1: vec.epilog.scalar.ph:
-; CHECK-AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-AVX1-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-AVX1: for.cond.cleanup.loopexit:
-; CHECK-AVX1-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK-AVX1: for.cond.cleanup:
-; CHECK-AVX1-NEXT: ret void
-; CHECK-AVX1: for.body:
-; CHECK-AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT: [[TMP44:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT: [[TMP45:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; CHECK-AVX1-NEXT: [[ADD:%.*]] = add i8 [[TMP45]], [[TMP44]]
-; CHECK-AVX1-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX6]], align 1
-; CHECK-AVX1-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT: [[TMP46:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4
-; CHECK-AVX1-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4
-; CHECK-AVX1-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP47]], [[TMP46]]
-; CHECK-AVX1-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
-; CHECK-AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-;
-; CHECK-AVX2-LABEL: @foo(
-; CHECK-AVX2-NEXT: iter.check:
-; CHECK-AVX2-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-AVX2: vector.main.loop.iter.check:
-; CHECK-AVX2-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-AVX2: vector.ph:
-; CHECK-AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-AVX2: vector.body:
-; CHECK-AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 [[TMP0]]
-; CHECK-AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-AVX2-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <32 x i8>*
-; CHECK-AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP3]], align 1
-; CHECK-AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 [[TMP0]]
-; CHECK-AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; CHECK-AVX2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <32 x i8>*
-; CHECK-AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, <32 x i8>* [[TMP6]], align 1
-; CHECK-AVX2-NEXT: [[TMP7:%.*]] = add <32 x i8> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 [[TMP0]]
-; CHECK-AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 0
-; CHECK-AVX2-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <32 x i8>*
-; CHECK-AVX2-NEXT: store <32 x i8> [[TMP7]], <32 x i8>* [[TMP10]], align 1
-; CHECK-AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 [[TMP0]]
-; CHECK-AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <32 x i32>*
-; CHECK-AVX2-NEXT: [[WIDE_LOAD2:%.*]] = load <32 x i32>, <32 x i32>* [[TMP13]], align 4
-; CHECK-AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 [[TMP0]]
-; CHECK-AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
-; CHECK-AVX2-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <32 x i32>*
-; CHECK-AVX2-NEXT: [[WIDE_LOAD3:%.*]] = load <32 x i32>, <32 x i32>* [[TMP16]], align 4
-; CHECK-AVX2-NEXT: [[TMP17:%.*]] = add nsw <32 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD2]]
-; CHECK-AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 [[TMP0]]
-; CHECK-AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-AVX2-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <32 x i32>*
-; CHECK-AVX2-NEXT: store <32 x i32> [[TMP17]], <32 x i32>* [[TMP20]], align 4
-; CHECK-AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-AVX2-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992
-; CHECK-AVX2-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK-AVX2: middle.block:
-; CHECK-AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 992
-; CHECK-AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-AVX2: vec.epilog.iter.check:
-; CHECK-AVX2-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK-AVX2: vec.epilog.ph:
-; CHECK-AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-AVX2-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK-AVX2: vec.epilog.vector.body:
-; CHECK-AVX2-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-AVX2-NEXT: [[TMP22:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 [[TMP22]]
-; CHECK-AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, i8* [[TMP23]], i32 0
-; CHECK-AVX2-NEXT: [[TMP25:%.*]] = bitcast i8* [[TMP24]] to <16 x i8>*
-; CHECK-AVX2-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, <16 x i8>* [[TMP25]], align 1
-; CHECK-AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 [[TMP22]]
-; CHECK-AVX2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, i8* [[TMP26]], i32 0
-; CHECK-AVX2-NEXT: [[TMP28:%.*]] = bitcast i8* [[TMP27]] to <16 x i8>*
-; CHECK-AVX2-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, <16 x i8>* [[TMP28]], align 1
-; CHECK-AVX2-NEXT: [[TMP29:%.*]] = add <16 x i8> [[WIDE_LOAD7]], [[WIDE_LOAD6]]
-; CHECK-AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 [[TMP22]]
-; CHECK-AVX2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, i8* [[TMP30]], i32 0
-; CHECK-AVX2-NEXT: [[TMP32:%.*]] = bitcast i8* [[TMP31]] to <16 x i8>*
-; CHECK-AVX2-NEXT: store <16 x i8> [[TMP29]], <16 x i8>* [[TMP32]], align 1
-; CHECK-AVX2-NEXT: [[TMP33:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 [[TMP22]]
-; CHECK-AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[TMP33]], i32 0
-; CHECK-AVX2-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <16 x i32>*
-; CHECK-AVX2-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i32>, <16 x i32>* [[TMP35]], align 4
-; CHECK-AVX2-NEXT: [[TMP36:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 [[TMP22]]
-; CHECK-AVX2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, i32* [[TMP36]], i32 0
-; CHECK-AVX2-NEXT: [[TMP38:%.*]] = bitcast i32* [[TMP37]] to <16 x i32>*
-; CHECK-AVX2-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i32>, <16 x i32>* [[TMP38]], align 4
-; CHECK-AVX2-NEXT: [[TMP39:%.*]] = add nsw <16 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD8]]
-; CHECK-AVX2-NEXT: [[TMP40:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 [[TMP22]]
-; CHECK-AVX2-NEXT: [[TMP41:%.*]] = getelementptr inbounds i32, i32* [[TMP40]], i32 0
-; CHECK-AVX2-NEXT: [[TMP42:%.*]] = bitcast i32* [[TMP41]] to <16 x i32>*
-; CHECK-AVX2-NEXT: store <16 x i32> [[TMP39]], <16 x i32>* [[TMP42]], align 4
-; CHECK-AVX2-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 16
-; CHECK-AVX2-NEXT: [[TMP43:%.*]] = icmp eq i64 [[INDEX_NEXT10]], 992
-; CHECK-AVX2-NEXT: br i1 [[TMP43]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK-AVX2: vec.epilog.middle.block:
-; CHECK-AVX2-NEXT: [[CMP_N4:%.*]] = icmp eq i64 1000, 992
-; CHECK-AVX2-NEXT: br i1 [[CMP_N4]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK-AVX2: vec.epilog.scalar.ph:
-; CHECK-AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 992, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-AVX2-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-AVX2: for.cond.cleanup.loopexit:
-; CHECK-AVX2-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK-AVX2: for.cond.cleanup:
-; CHECK-AVX2-NEXT: ret void
-; CHECK-AVX2: for.body:
-; CHECK-AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @b, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT: [[TMP44:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @c, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT: [[TMP45:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1
-; CHECK-AVX2-NEXT: [[ADD:%.*]] = add i8 [[TMP45]], [[TMP44]]
-; CHECK-AVX2-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [1000 x i8], [1000 x i8]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX6]], align 1
-; CHECK-AVX2-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @v, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT: [[TMP46:%.*]] = load i32, i32* [[ARRAYIDX8]], align 4
-; CHECK-AVX2-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @w, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4
-; CHECK-AVX2-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP47]], [[TMP46]]
-; CHECK-AVX2-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [1000 x i32], [1000 x i32]* @u, i64 0, i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT: store i32 [[ADD11]], i32* [[ARRAYIDX13]], align 4
-; CHECK-AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1000
-; CHECK-AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-;
entry:
br label %for.body
; VF chosen should be at most 16 (not the maximum possible vector width of 32 for AVX2)
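; (Hedged note based on the checks for this function: the loop has a trip count
; of 16, so selecting the full AVX2 width of 32 would exceed the trip count and
; leave the main vector loop with no iterations; capping the VF at 16 keeps the
; transformation profitable.)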
define void @not_too_small_tc(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) {
; CHECK-LABEL: not_too_small_tc
-; CHECK-AVX1-LABEL: @not_too_small_tc(
-; CHECK-AVX1-NEXT: iter.check:
-; CHECK-AVX1-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-AVX1: vector.main.loop.iter.check:
-; CHECK-AVX1-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-AVX1: vector.ph:
-; CHECK-AVX1-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-AVX1: vector.body:
-; CHECK-AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-AVX1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-AVX1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[TMP0]]
-; CHECK-AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-AVX1-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]]
-; CHECK-AVX1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; CHECK-AVX1-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
-; CHECK-AVX1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT: [[TMP7:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-AVX1-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
-; CHECK-AVX1-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[TMP8]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-AVX1-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-AVX1-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-AVX1: middle.block:
-; CHECK-AVX1-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16
-; CHECK-AVX1-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-AVX1: vec.epilog.iter.check:
-; CHECK-AVX1-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK-AVX1: vec.epilog.ph:
-; CHECK-AVX1-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-AVX1-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK-AVX1: vec.epilog.vector.body:
-; CHECK-AVX1-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-AVX1-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[TMP10]]
-; CHECK-AVX1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP11]], i32 0
-; CHECK-AVX1-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>*
-; CHECK-AVX1-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP10]]
-; CHECK-AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i32 0
-; CHECK-AVX1-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <8 x i8>*
-; CHECK-AVX1-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP16]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT: [[TMP17:%.*]] = add <8 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
-; CHECK-AVX1-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP15]] to <8 x i8>*
-; CHECK-AVX1-NEXT: store <8 x i8> [[TMP17]], <8 x i8>* [[TMP18]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-AVX1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT6]], 16
-; CHECK-AVX1-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-AVX1: vec.epilog.middle.block:
-; CHECK-AVX1-NEXT: [[CMP_N2:%.*]] = icmp eq i64 16, 16
-; CHECK-AVX1-NEXT: br i1 [[CMP_N2]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK-AVX1: vec.epilog.scalar.ph:
-; CHECK-AVX1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-AVX1-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-AVX1: for.body:
-; CHECK-AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT: [[L1:%.*]] = load i8, i8* [[ARRAYIDX]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]]
-; CHECK-AVX1-NEXT: [[L2:%.*]] = load i8, i8* [[ARRAYIDX2]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT: [[ADD:%.*]] = add i8 [[L1]], [[L2]]
-; CHECK-AVX1-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX2]], align 4, !llvm.access.group !5
-; CHECK-AVX1-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-AVX1-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; CHECK-AVX1-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-AVX1: for.end.loopexit:
-; CHECK-AVX1-NEXT: br label [[FOR_END]]
-; CHECK-AVX1: for.end:
-; CHECK-AVX1-NEXT: ret void
-;
-; CHECK-AVX2-LABEL: @not_too_small_tc(
-; CHECK-AVX2-NEXT: iter.check:
-; CHECK-AVX2-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK-AVX2: vector.main.loop.iter.check:
-; CHECK-AVX2-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-AVX2: vector.ph:
-; CHECK-AVX2-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-AVX2: vector.body:
-; CHECK-AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-AVX2-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-AVX2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[TMP0]]
-; CHECK-AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-AVX2-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <16 x i8>*
-; CHECK-AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP3]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP0]]
-; CHECK-AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[TMP4]], i32 0
-; CHECK-AVX2-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
-; CHECK-AVX2-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP6]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT: [[TMP7:%.*]] = add <16 x i8> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; CHECK-AVX2-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
-; CHECK-AVX2-NEXT: store <16 x i8> [[TMP7]], <16 x i8>* [[TMP8]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-AVX2-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; CHECK-AVX2-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK-AVX2: middle.block:
-; CHECK-AVX2-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16
-; CHECK-AVX2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK-AVX2: vec.epilog.iter.check:
-; CHECK-AVX2-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK-AVX2: vec.epilog.ph:
-; CHECK-AVX2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-AVX2-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK-AVX2: vec.epilog.vector.body:
-; CHECK-AVX2-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-AVX2-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[TMP10]]
-; CHECK-AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP11]], i32 0
-; CHECK-AVX2-NEXT: [[TMP13:%.*]] = bitcast i8* [[TMP12]] to <8 x i8>*
-; CHECK-AVX2-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP13]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP10]]
-; CHECK-AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TMP14]], i32 0
-; CHECK-AVX2-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <8 x i8>*
-; CHECK-AVX2-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i8>, <8 x i8>* [[TMP16]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT: [[TMP17:%.*]] = add <8 x i8> [[WIDE_LOAD4]], [[WIDE_LOAD5]]
-; CHECK-AVX2-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP15]] to <8 x i8>*
-; CHECK-AVX2-NEXT: store <8 x i8> [[TMP17]], <8 x i8>* [[TMP18]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[OFFSET_IDX]], 8
-; CHECK-AVX2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT6]], 16
-; CHECK-AVX2-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-AVX2: vec.epilog.middle.block:
-; CHECK-AVX2-NEXT: [[CMP_N2:%.*]] = icmp eq i64 16, 16
-; CHECK-AVX2-NEXT: br i1 [[CMP_N2]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK-AVX2: vec.epilog.scalar.ph:
-; CHECK-AVX2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-AVX2-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-AVX2: for.body:
-; CHECK-AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT: [[L1:%.*]] = load i8, i8* [[ARRAYIDX]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV]]
-; CHECK-AVX2-NEXT: [[L2:%.*]] = load i8, i8* [[ARRAYIDX2]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT: [[ADD:%.*]] = add i8 [[L1]], [[L2]]
-; CHECK-AVX2-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX2]], align 4, !llvm.access.group !5
-; CHECK-AVX2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-AVX2-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; CHECK-AVX2-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK-AVX2: for.end.loopexit:
-; CHECK-AVX2-NEXT: br label [[FOR_END]]
-; CHECK-AVX2: for.end:
-; CHECK-AVX2-NEXT: ret void
-;
+; CHECK-AVX2: LV: Selecting VF: 16.
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=VECTORIZED %s
; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=UNROLLED %s
; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=1 -mtriple=x86_64-unknown-linux -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck -check-prefix=NONE %s
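; A short gloss on the three configurations above (inferred from the RUN flags,
; not stated in the original test): the first run leaves the cost model free, so
; a vectorization remark is expected (VECTORIZED); the second forces VF=1 with an
; interleave count of 4, so presumably only an interleaving remark is emitted
; (UNROLLED); the third forces VF=1, IC=1 and requests analysis remarks, which
; should explain why the loop was left alone (NONE).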
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define i32 @foo(i32 %n) #0 !dbg !4 {
-; VECTORIZED-LABEL: @foo(
-; VECTORIZED-NEXT: entry:
-; VECTORIZED-NEXT: [[DIFF:%.*]] = alloca i32, align 4
-; VECTORIZED-NEXT: [[CB:%.*]] = alloca [16 x i8], align 16
-; VECTORIZED-NEXT: [[CC:%.*]] = alloca [16 x i8], align 16
-; VECTORIZED-NEXT: store i32 0, i32* [[DIFF]], align 4, !tbaa [[TBAA8:![0-9]+]]
-; VECTORIZED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; VECTORIZED: vector.ph:
-; VECTORIZED-NEXT: br label [[VECTOR_BODY:%.*]]
-; VECTORIZED: vector.body:
-; VECTORIZED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VECTORIZED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; VECTORIZED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VECTORIZED-NEXT: [[TMP1:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[TMP0]]
-; VECTORIZED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; VECTORIZED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; VECTORIZED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1, !tbaa [[TBAA12:![0-9]+]]
-; VECTORIZED-NEXT: [[TMP4:%.*]] = sext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
-; VECTORIZED-NEXT: [[TMP5:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[TMP0]]
-; VECTORIZED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0
-; VECTORIZED-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
-; VECTORIZED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1, !tbaa [[TBAA12]]
-; VECTORIZED-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>
-; VECTORIZED-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
-; VECTORIZED-NEXT: [[TMP10]] = add <4 x i32> [[TMP9]], [[VEC_PHI]]
-; VECTORIZED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; VECTORIZED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; VECTORIZED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; VECTORIZED: middle.block:
-; VECTORIZED-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]])
-; VECTORIZED-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16
-; VECTORIZED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; VECTORIZED: scalar.ph:
-; VECTORIZED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VECTORIZED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; VECTORIZED-NEXT: br label [[FOR_BODY:%.*]]
-; VECTORIZED: for.body:
-; VECTORIZED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; VECTORIZED-NEXT: [[ADD8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDVARS_IV]]
-; VECTORIZED-NEXT: [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !tbaa [[TBAA12]]
-; VECTORIZED-NEXT: [[CONV:%.*]] = sext i8 [[TMP13]] to i32
-; VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDVARS_IV]]
-; VECTORIZED-NEXT: [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !tbaa [[TBAA12]]
-; VECTORIZED-NEXT: [[CONV3:%.*]] = sext i8 [[TMP14]] to i32
-; VECTORIZED-NEXT: [[SUB:%.*]] = sub i32 [[CONV]], [[CONV3]]
-; VECTORIZED-NEXT: [[ADD]] = add nsw i32 [[SUB]], [[ADD8]]
-; VECTORIZED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; VECTORIZED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; VECTORIZED-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; VECTORIZED: for.end:
-; VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; VECTORIZED-NEXT: store i32 [[ADD_LCSSA]], i32* [[DIFF]], align 4, !tbaa [[TBAA8]]
-; VECTORIZED-NEXT: call void @ibar(i32* [[DIFF]])
-; VECTORIZED-NEXT: ret i32 0
-;
-; UNROLLED-LABEL: @foo(
-; UNROLLED-NEXT: entry:
-; UNROLLED-NEXT: [[DIFF:%.*]] = alloca i32, align 4
-; UNROLLED-NEXT: [[CB:%.*]] = alloca [16 x i8], align 16
-; UNROLLED-NEXT: [[CC:%.*]] = alloca [16 x i8], align 16
-; UNROLLED-NEXT: store i32 0, i32* [[DIFF]], align 4, !tbaa [[TBAA8:![0-9]+]]
-; UNROLLED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UNROLLED: vector.ph:
-; UNROLLED-NEXT: br label [[VECTOR_BODY:%.*]]
-; UNROLLED: vector.body:
-; UNROLLED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLLED-NEXT: [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1
-; UNROLLED-NEXT: [[INDUCTION5:%.*]] = add i64 [[INDEX]], 2
-; UNROLLED-NEXT: [[INDUCTION6:%.*]] = add i64 [[INDEX]], 3
-; UNROLLED-NEXT: [[TMP0:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION]]
-; UNROLLED-NEXT: [[TMP1:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION4]]
-; UNROLLED-NEXT: [[TMP2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION5]]
-; UNROLLED-NEXT: [[TMP3:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION6]]
-; UNROLLED-NEXT: [[TMP4:%.*]] = load i8, i8* [[TMP0]], align 1, !tbaa [[TBAA12:![0-9]+]]
-; UNROLLED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP1]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP2]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP3]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT: [[TMP8:%.*]] = sext i8 [[TMP4]] to i32
-; UNROLLED-NEXT: [[TMP9:%.*]] = sext i8 [[TMP5]] to i32
-; UNROLLED-NEXT: [[TMP10:%.*]] = sext i8 [[TMP6]] to i32
-; UNROLLED-NEXT: [[TMP11:%.*]] = sext i8 [[TMP7]] to i32
-; UNROLLED-NEXT: [[TMP12:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION]]
-; UNROLLED-NEXT: [[TMP13:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION4]]
-; UNROLLED-NEXT: [[TMP14:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION5]]
-; UNROLLED-NEXT: [[TMP15:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION6]]
-; UNROLLED-NEXT: [[TMP16:%.*]] = load i8, i8* [[TMP12]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT: [[TMP17:%.*]] = load i8, i8* [[TMP13]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT: [[TMP18:%.*]] = load i8, i8* [[TMP14]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT: [[TMP19:%.*]] = load i8, i8* [[TMP15]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT: [[TMP20:%.*]] = sext i8 [[TMP16]] to i32
-; UNROLLED-NEXT: [[TMP21:%.*]] = sext i8 [[TMP17]] to i32
-; UNROLLED-NEXT: [[TMP22:%.*]] = sext i8 [[TMP18]] to i32
-; UNROLLED-NEXT: [[TMP23:%.*]] = sext i8 [[TMP19]] to i32
-; UNROLLED-NEXT: [[TMP24:%.*]] = sub i32 [[TMP8]], [[TMP20]]
-; UNROLLED-NEXT: [[TMP25:%.*]] = sub i32 [[TMP9]], [[TMP21]]
-; UNROLLED-NEXT: [[TMP26:%.*]] = sub i32 [[TMP10]], [[TMP22]]
-; UNROLLED-NEXT: [[TMP27:%.*]] = sub i32 [[TMP11]], [[TMP23]]
-; UNROLLED-NEXT: [[TMP28]] = add i32 [[TMP24]], [[VEC_PHI]]
-; UNROLLED-NEXT: [[TMP29]] = add i32 [[TMP25]], [[VEC_PHI1]]
-; UNROLLED-NEXT: [[TMP30]] = add i32 [[TMP26]], [[VEC_PHI2]]
-; UNROLLED-NEXT: [[TMP31]] = add i32 [[TMP27]], [[VEC_PHI3]]
-; UNROLLED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLLED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; UNROLLED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; UNROLLED: middle.block:
-; UNROLLED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP29]], [[TMP28]]
-; UNROLLED-NEXT: [[BIN_RDX7:%.*]] = add i32 [[TMP30]], [[BIN_RDX]]
-; UNROLLED-NEXT: [[BIN_RDX8:%.*]] = add i32 [[TMP31]], [[BIN_RDX7]]
-; UNROLLED-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16
-; UNROLLED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; UNROLLED: scalar.ph:
-; UNROLLED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLLED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; UNROLLED-NEXT: br label [[FOR_BODY:%.*]]
-; UNROLLED: for.body:
-; UNROLLED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; UNROLLED-NEXT: [[ADD8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; UNROLLED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDVARS_IV]]
-; UNROLLED-NEXT: [[TMP33:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT: [[CONV:%.*]] = sext i8 [[TMP33]] to i32
-; UNROLLED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDVARS_IV]]
-; UNROLLED-NEXT: [[TMP34:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !tbaa [[TBAA12]]
-; UNROLLED-NEXT: [[CONV3:%.*]] = sext i8 [[TMP34]] to i32
-; UNROLLED-NEXT: [[SUB:%.*]] = sub i32 [[CONV]], [[CONV3]]
-; UNROLLED-NEXT: [[ADD]] = add nsw i32 [[SUB]], [[ADD8]]
-; UNROLLED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; UNROLLED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; UNROLLED-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; UNROLLED: for.end:
-; UNROLLED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; UNROLLED-NEXT: store i32 [[ADD_LCSSA]], i32* [[DIFF]], align 4, !tbaa [[TBAA8]]
-; UNROLLED-NEXT: call void @ibar(i32* [[DIFF]])
-; UNROLLED-NEXT: ret i32 0
-;
-; NONE-LABEL: @foo(
-; NONE-NEXT: entry:
-; NONE-NEXT: [[DIFF:%.*]] = alloca i32, align 4
-; NONE-NEXT: [[CB:%.*]] = alloca [16 x i8], align 16
-; NONE-NEXT: [[CC:%.*]] = alloca [16 x i8], align 16
-; NONE-NEXT: store i32 0, i32* [[DIFF]], align 4, !tbaa [[TBAA8:![0-9]+]]
-; NONE-NEXT: br label [[FOR_BODY:%.*]]
-; NONE: for.body:
-; NONE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NONE-NEXT: [[ADD8:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; NONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDVARS_IV]]
-; NONE-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !tbaa [[TBAA12:![0-9]+]]
-; NONE-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32
-; NONE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDVARS_IV]]
-; NONE-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !tbaa [[TBAA12]]
-; NONE-NEXT: [[CONV3:%.*]] = sext i8 [[TMP1]] to i32
-; NONE-NEXT: [[SUB:%.*]] = sub i32 [[CONV]], [[CONV3]]
-; NONE-NEXT: [[ADD]] = add nsw i32 [[SUB]], [[ADD8]]
-; NONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; NONE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16
-; NONE-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; NONE: for.end:
-; NONE-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ]
-; NONE-NEXT: store i32 [[ADD_LCSSA]], i32* [[DIFF]], align 4, !tbaa [[TBAA8]]
-; NONE-NEXT: call void @ibar(i32* [[DIFF]])
-; NONE-NEXT: ret i32 0
-;
entry:
%diff = alloca i32, align 4
%cb = alloca [16 x i8], align 16
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -pass-remarks-missed='loop-vectorize' -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
; Verify analysis remarks are generated when interleaving is not beneficial.
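; (Hedged note: -pass-remarks-missed='loop-vectorize' prints the remarks emitted
; when the vectorizer decides not to transform a loop, which is what the two
; functions below are expected to trigger.)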
; Function Attrs: nounwind uwtable
define void @do_not_interleave(float** noalias nocapture readonly %in, float* noalias nocapture %out, i32 %size) #0 !dbg !4 {
-; CHECK-LABEL: @do_not_interleave(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP_4:%.*]] = icmp eq i32 [[SIZE:%.*]], 0, !dbg [[DBG8:![0-9]+]]
-; CHECK-NEXT: br i1 [[CMP_4]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]], !dbg [[DBG9:![0-9]+]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG10:![0-9]+]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float*, float** [[IN:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG10]]
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float** [[ARRAYIDX]] to i32**, !dbg [[DBG10]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[TMP0]], align 8, !dbg [[DBG10]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4, !dbg [[DBG11:![0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG12:![0-9]+]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to i32*, !dbg [[DBG13:![0-9]+]]
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP3]], align 4, !dbg [[DBG13]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG9]]
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG9]]
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]], !dbg [[DBG9]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG9]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]], !dbg [[DBG17:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void, !dbg [[DBG17]]
-;
entry:
%cmp.4 = icmp eq i32 %size, 0, !dbg !10
br i1 %cmp.4, label %for.end, label %for.body.preheader, !dbg !11
; Function Attrs: nounwind uwtable
define void @interleave_not_profitable(float** noalias nocapture readonly %in, float* noalias nocapture %out, i32 %size) #0 !dbg !6 {
-; CHECK-LABEL: @interleave_not_profitable(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP_4:%.*]] = icmp eq i32 [[SIZE:%.*]], 0, !dbg [[DBG19:![0-9]+]]
-; CHECK-NEXT: br i1 [[CMP_4]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]], !dbg [[DBG20:![0-9]+]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG20]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float*, float** [[IN:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG21:![0-9]+]]
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast float** [[ARRAYIDX]] to i32**, !dbg [[DBG21]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[TMP0]], align 8, !dbg [[DBG21]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4, !dbg [[DBG22:![0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG23:![0-9]+]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX2]] to i32*, !dbg [[DBG24:![0-9]+]]
-; CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP3]], align 4, !dbg [[DBG24]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG20]]
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG20]]
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]], !dbg [[DBG20]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG20]], !llvm.loop [[LOOP25:![0-9]+]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]], !dbg [[DBG26:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void, !dbg [[DBG26]]
-;
entry:
%cmp.4 = icmp eq i32 %size, 0, !dbg !20
br i1 %cmp.4, label %for.end, label %for.body, !dbg !21
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=VECTORIZED %s
; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=4 -mtriple=x86_64-unknown-linux -S -pass-remarks='loop-vectorize' 2>&1 | FileCheck -check-prefix=UNROLLED %s
; RUN: opt < %s -loop-vectorize -force-vector-width=1 -force-vector-interleave=1 -mtriple=x86_64-unknown-linux -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck -check-prefix=NONE %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define i32 @foo(i32 %n) #0 !dbg !4 {
-; VECTORIZED-LABEL: @foo(
-; VECTORIZED-NEXT: entry:
-; VECTORIZED-NEXT: [[DIFF:%.*]] = alloca i32, align 4
-; VECTORIZED-NEXT: [[CB:%.*]] = alloca [16 x i8], align 16
-; VECTORIZED-NEXT: [[CC:%.*]] = alloca [16 x i8], align 16
-; VECTORIZED-NEXT: store i32 0, i32* [[DIFF]], align 4, !dbg [[DBG8:![0-9]+]], !tbaa [[TBAA9:![0-9]+]]
-; VECTORIZED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !dbg [[DBG13:![0-9]+]]
-; VECTORIZED: vector.ph:
-; VECTORIZED-NEXT: br label [[VECTOR_BODY:%.*]], !dbg [[DBG13]]
-; VECTORIZED: vector.body:
-; VECTORIZED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], !dbg [[DBG13]]
-; VECTORIZED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; VECTORIZED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VECTORIZED-NEXT: [[TMP1:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[TMP0]], !dbg [[DBG17:![0-9]+]]
-; VECTORIZED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0, !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*, !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19:![0-9]+]]
-; VECTORIZED-NEXT: [[TMP4:%.*]] = sext <4 x i8> [[WIDE_LOAD]] to <4 x i32>, !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[TMP5:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[TMP0]], !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0, !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*, !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; VECTORIZED-NEXT: [[TMP8:%.*]] = sext <4 x i8> [[WIDE_LOAD1]] to <4 x i32>, !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]], !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[TMP10]] = add <4 x i32> [[TMP9]], [[VEC_PHI]], !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG13]]
-; VECTORIZED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16, !dbg [[DBG13]]
-; VECTORIZED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg [[DBG13]], !llvm.loop [[LOOP20:![0-9]+]]
-; VECTORIZED: middle.block:
-; VECTORIZED-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP10]]), !dbg [[DBG13]]
-; VECTORIZED-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16, !dbg [[DBG13]]
-; VECTORIZED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]], !dbg [[DBG13]]
-; VECTORIZED: scalar.ph:
-; VECTORIZED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VECTORIZED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
-; VECTORIZED-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG13]]
-; VECTORIZED: for.body:
-; VECTORIZED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; VECTORIZED-NEXT: [[ADD8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ], !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDVARS_IV]], !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; VECTORIZED-NEXT: [[CONV:%.*]] = sext i8 [[TMP13]] to i32, !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDVARS_IV]], !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[TMP14:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; VECTORIZED-NEXT: [[CONV3:%.*]] = sext i8 [[TMP14]] to i32, !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[SUB:%.*]] = sub i32 [[CONV]], [[CONV3]], !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[ADD]] = add nsw i32 [[SUB]], [[ADD8]], !dbg [[DBG17]]
-; VECTORIZED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG13]]
-; VECTORIZED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16, !dbg [[DBG13]]
-; VECTORIZED-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !dbg [[DBG13]], !llvm.loop [[LOOP22:![0-9]+]]
-; VECTORIZED: for.end:
-; VECTORIZED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ], !dbg [[DBG17]]
-; VECTORIZED-NEXT: store i32 [[ADD_LCSSA]], i32* [[DIFF]], align 4, !dbg [[DBG17]], !tbaa [[TBAA9]]
-; VECTORIZED-NEXT: call void @ibar(i32* [[DIFF]]), !dbg [[DBG24:![0-9]+]]
-; VECTORIZED-NEXT: ret i32 0, !dbg [[DBG25:![0-9]+]]
-;
-; UNROLLED-LABEL: @foo(
-; UNROLLED-NEXT: entry:
-; UNROLLED-NEXT: [[DIFF:%.*]] = alloca i32, align 4
-; UNROLLED-NEXT: [[CB:%.*]] = alloca [16 x i8], align 16
-; UNROLLED-NEXT: [[CC:%.*]] = alloca [16 x i8], align 16
-; UNROLLED-NEXT: store i32 0, i32* [[DIFF]], align 4, !dbg [[DBG8:![0-9]+]], !tbaa [[TBAA9:![0-9]+]]
-; UNROLLED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !dbg [[DBG13:![0-9]+]]
-; UNROLLED: vector.ph:
-; UNROLLED-NEXT: br label [[VECTOR_BODY:%.*]], !dbg [[DBG13]]
-; UNROLLED: vector.body:
-; UNROLLED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], !dbg [[DBG13]]
-; UNROLLED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[VECTOR_BODY]] ]
-; UNROLLED-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLLED-NEXT: [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1
-; UNROLLED-NEXT: [[INDUCTION5:%.*]] = add i64 [[INDEX]], 2
-; UNROLLED-NEXT: [[INDUCTION6:%.*]] = add i64 [[INDEX]], 3
-; UNROLLED-NEXT: [[TMP0:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION]], !dbg [[DBG17:![0-9]+]]
-; UNROLLED-NEXT: [[TMP1:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION4]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION5]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP3:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDUCTION6]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP4:%.*]] = load i8, i8* [[TMP0]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19:![0-9]+]]
-; UNROLLED-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP1]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT: [[TMP6:%.*]] = load i8, i8* [[TMP2]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT: [[TMP7:%.*]] = load i8, i8* [[TMP3]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT: [[TMP8:%.*]] = sext i8 [[TMP4]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP9:%.*]] = sext i8 [[TMP5]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP10:%.*]] = sext i8 [[TMP6]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP11:%.*]] = sext i8 [[TMP7]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP12:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP13:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION4]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP14:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION5]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP15:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDUCTION6]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP16:%.*]] = load i8, i8* [[TMP12]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT: [[TMP17:%.*]] = load i8, i8* [[TMP13]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT: [[TMP18:%.*]] = load i8, i8* [[TMP14]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT: [[TMP19:%.*]] = load i8, i8* [[TMP15]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT: [[TMP20:%.*]] = sext i8 [[TMP16]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP21:%.*]] = sext i8 [[TMP17]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP22:%.*]] = sext i8 [[TMP18]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP23:%.*]] = sext i8 [[TMP19]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP24:%.*]] = sub i32 [[TMP8]], [[TMP20]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP25:%.*]] = sub i32 [[TMP9]], [[TMP21]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP26:%.*]] = sub i32 [[TMP10]], [[TMP22]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP27:%.*]] = sub i32 [[TMP11]], [[TMP23]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP28]] = add i32 [[TMP24]], [[VEC_PHI]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP29]] = add i32 [[TMP25]], [[VEC_PHI1]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP30]] = add i32 [[TMP26]], [[VEC_PHI2]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP31]] = add i32 [[TMP27]], [[VEC_PHI3]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4, !dbg [[DBG13]]
-; UNROLLED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16, !dbg [[DBG13]]
-; UNROLLED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !dbg [[DBG13]], !llvm.loop [[LOOP20:![0-9]+]]
-; UNROLLED: middle.block:
-; UNROLLED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP29]], [[TMP28]], !dbg [[DBG13]]
-; UNROLLED-NEXT: [[BIN_RDX7:%.*]] = add i32 [[TMP30]], [[BIN_RDX]], !dbg [[DBG13]]
-; UNROLLED-NEXT: [[BIN_RDX8:%.*]] = add i32 [[TMP31]], [[BIN_RDX7]], !dbg [[DBG13]]
-; UNROLLED-NEXT: [[CMP_N:%.*]] = icmp eq i64 16, 16, !dbg [[DBG13]]
-; UNROLLED-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]], !dbg [[DBG13]]
-; UNROLLED: scalar.ph:
-; UNROLLED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UNROLLED-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ]
-; UNROLLED-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG13]]
-; UNROLLED: for.body:
-; UNROLLED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; UNROLLED-NEXT: [[ADD8:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDVARS_IV]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP33:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT: [[CONV:%.*]] = sext i8 [[TMP33]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDVARS_IV]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[TMP34:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; UNROLLED-NEXT: [[CONV3:%.*]] = sext i8 [[TMP34]] to i32, !dbg [[DBG17]]
-; UNROLLED-NEXT: [[SUB:%.*]] = sub i32 [[CONV]], [[CONV3]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[ADD]] = add nsw i32 [[SUB]], [[ADD8]], !dbg [[DBG17]]
-; UNROLLED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG13]]
-; UNROLLED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16, !dbg [[DBG13]]
-; UNROLLED-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !dbg [[DBG13]], !llvm.loop [[LOOP22:![0-9]+]]
-; UNROLLED: for.end:
-; UNROLLED-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX8]], [[MIDDLE_BLOCK]] ], !dbg [[DBG17]]
-; UNROLLED-NEXT: store i32 [[ADD_LCSSA]], i32* [[DIFF]], align 4, !dbg [[DBG17]], !tbaa [[TBAA9]]
-; UNROLLED-NEXT: call void @ibar(i32* [[DIFF]]), !dbg [[DBG23:![0-9]+]]
-; UNROLLED-NEXT: ret i32 0, !dbg [[DBG24:![0-9]+]]
-;
-; NONE-LABEL: @foo(
-; NONE-NEXT: entry:
-; NONE-NEXT: [[DIFF:%.*]] = alloca i32, align 4
-; NONE-NEXT: [[CB:%.*]] = alloca [16 x i8], align 16
-; NONE-NEXT: [[CC:%.*]] = alloca [16 x i8], align 16
-; NONE-NEXT: store i32 0, i32* [[DIFF]], align 4, !dbg [[DBG8:![0-9]+]], !tbaa [[TBAA9:![0-9]+]]
-; NONE-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG13:![0-9]+]]
-; NONE: for.body:
-; NONE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; NONE-NEXT: [[ADD8:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD:%.*]], [[FOR_BODY]] ], !dbg [[DBG17:![0-9]+]]
-; NONE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CB]], i64 0, i64 [[INDVARS_IV]], !dbg [[DBG17]]
-; NONE-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19:![0-9]+]]
-; NONE-NEXT: [[CONV:%.*]] = sext i8 [[TMP0]] to i32, !dbg [[DBG17]]
-; NONE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [16 x i8], [16 x i8]* [[CC]], i64 0, i64 [[INDVARS_IV]], !dbg [[DBG17]]
-; NONE-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG17]], !tbaa [[TBAA19]]
-; NONE-NEXT: [[CONV3:%.*]] = sext i8 [[TMP1]] to i32, !dbg [[DBG17]]
-; NONE-NEXT: [[SUB:%.*]] = sub i32 [[CONV]], [[CONV3]], !dbg [[DBG17]]
-; NONE-NEXT: [[ADD]] = add nsw i32 [[SUB]], [[ADD8]], !dbg [[DBG17]]
-; NONE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG13]]
-; NONE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 16, !dbg [[DBG13]]
-; NONE-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !dbg [[DBG13]]
-; NONE: for.end:
-; NONE-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], !dbg [[DBG17]]
-; NONE-NEXT: store i32 [[ADD_LCSSA]], i32* [[DIFF]], align 4, !dbg [[DBG17]], !tbaa [[TBAA9]]
-; NONE-NEXT: call void @ibar(i32* [[DIFF]]), !dbg [[DBG20:![0-9]+]]
-; NONE-NEXT: ret i32 0, !dbg [[DBG21:![0-9]+]]
-;
entry:
%diff = alloca i32, align 4
%cb = alloca [16 x i8], align 16
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -basic-aa -loop-vectorize < %s | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.11.0"
define i32 @accum(i32* nocapture readonly %x, i32 %N) #0 {
-; CHECK-LABEL: @accum(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_INC_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.inc.preheader:
-; CHECK-NEXT: br label [[FOR_INC:%.*]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[FOR_INC_PREHEADER]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_INC]] ], [ 0, [[FOR_INC_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP0]], [[SUM_02]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_INC]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_INC]] ]
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_END_LOOPEXIT]] ]
-; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]]
-;
entry:
+; CHECK-LABEL: @accum
+; CHECK-NOT: x i32>
%cmp1 = icmp sgt i32 %N, 0
br i1 %cmp1, label %for.inc.preheader, label %for.end
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add.lcssa, %for.end.loopexit ]
ret i32 %sum.0.lcssa
+; CHECK: ret i32
}
attributes #0 = { "target-cpu"="core2" "target-features"="+sse,-avx,-avx2,-sse2" }
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -enable-interleaved-mem-accesses=true -force-vector-width=4 -loop-vectorize -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
;
; CHECK-NOT: load {{.*}} x x86_fp80
define x86_fp80 @foo(x86_fp80* %a) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x x86_fp80> [ <x86_fp80 undef, x86_fp80 0xK00000000000000000000, x86_fp80 0xK00000000000000000000, x86_fp80 0xK00000000000000000000>, [[VECTOR_PH]] ], [ [[TMP34:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i16 [[TMP0]], 2
-; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i16 [[OFFSET_IDX]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = add i16 [[OFFSET_IDX]], 6
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A:%.*]], i16 [[TMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = load x86_fp80, x86_fp80* [[TMP5]], align 1
-; CHECK-NEXT: [[TMP10:%.*]] = load x86_fp80, x86_fp80* [[TMP6]], align 1
-; CHECK-NEXT: [[TMP11:%.*]] = load x86_fp80, x86_fp80* [[TMP7]], align 1
-; CHECK-NEXT: [[TMP12:%.*]] = load x86_fp80, x86_fp80* [[TMP8]], align 1
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x x86_fp80> poison, x86_fp80 [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x x86_fp80> [[TMP13]], x86_fp80 [[TMP10]], i32 1
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x x86_fp80> [[TMP14]], x86_fp80 [[TMP11]], i32 2
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x x86_fp80> [[TMP15]], x86_fp80 [[TMP12]], i32 3
-; CHECK-NEXT: [[TMP17:%.*]] = or i16 [[TMP1]], 1
-; CHECK-NEXT: [[TMP18:%.*]] = or i16 [[TMP2]], 1
-; CHECK-NEXT: [[TMP19:%.*]] = or i16 [[TMP3]], 1
-; CHECK-NEXT: [[TMP20:%.*]] = or i16 [[TMP4]], 1
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[TMP17]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[TMP18]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[TMP19]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[TMP20]]
-; CHECK-NEXT: [[TMP25:%.*]] = load x86_fp80, x86_fp80* [[TMP21]], align 1
-; CHECK-NEXT: [[TMP26:%.*]] = load x86_fp80, x86_fp80* [[TMP22]], align 1
-; CHECK-NEXT: [[TMP27:%.*]] = load x86_fp80, x86_fp80* [[TMP23]], align 1
-; CHECK-NEXT: [[TMP28:%.*]] = load x86_fp80, x86_fp80* [[TMP24]], align 1
-; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x x86_fp80> poison, x86_fp80 [[TMP25]], i32 0
-; CHECK-NEXT: [[TMP30:%.*]] = insertelement <4 x x86_fp80> [[TMP29]], x86_fp80 [[TMP26]], i32 1
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x x86_fp80> [[TMP30]], x86_fp80 [[TMP27]], i32 2
-; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x x86_fp80> [[TMP31]], x86_fp80 [[TMP28]], i32 3
-; CHECK-NEXT: [[TMP33:%.*]] = fadd fast <4 x x86_fp80> [[TMP16]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP34]] = fadd fast <4 x x86_fp80> [[TMP33]], [[TMP32]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i32 [[INDEX_NEXT]], 200
-; CHECK-NEXT: br i1 [[TMP35]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP36:%.*]] = call fast x86_fp80 @llvm.vector.reduce.fadd.v4f80(x86_fp80 0xK80000000000000000000, <4 x x86_fp80> [[TMP34]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 200, 200
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i16 [ 400, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi x86_fp80 [ undef, [[ENTRY]] ], [ [[TMP36]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi x86_fp80 [ [[TMP40:%.*]], [[FOR_BODY]] ], [ [[TMP36]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret x86_fp80 [[DOTLCSSA]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_09:%.*]] = phi i16 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD3:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[RES_08:%.*]] = phi x86_fp80 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[TMP40]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[I_09]]
-; CHECK-NEXT: [[TMP37:%.*]] = load x86_fp80, x86_fp80* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ADD:%.*]] = or i16 [[I_09]], 1
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds x86_fp80, x86_fp80* [[A]], i16 [[ADD]]
-; CHECK-NEXT: [[TMP38:%.*]] = load x86_fp80, x86_fp80* [[ARRAYIDX2]], align 1
-; CHECK-NEXT: [[TMP39:%.*]] = fadd fast x86_fp80 [[TMP37]], [[RES_08]]
-; CHECK-NEXT: [[TMP40]] = fadd fast x86_fp80 [[TMP39]], [[TMP38]]
-; CHECK-NEXT: [[ADD3]] = add nuw nsw i16 [[I_09]], 2
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i16 [[ADD3]], 400
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP2:![0-9]+]]
-;
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=2 -S %s | FileCheck %s
; Tests with alias sets that contain pointers with uncomputable bounds
; (the accessed addresses depend on values loaded inside the loop).
; Alias set with uncomputable bounds contains a single load. We do not need
; runtime checks for that group and it should not block vectorization.
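;
; For illustration only, a hypothetical C source corresponding to test1 below
; (function and parameter names are assumed, not taken from the original
; test). The p2 access is indexed by data loaded in the loop, so its bounds
; cannot be computed, but it is the only access (a load) in its alias set, so
; no runtime check is needed and vectorization can proceed:
;
;   void test1(int *restrict p1, int *restrict p2, int *restrict p3,
;              long N, long X) {
;     for (long i = 0; i < N; i++) {
;       int v = p2[p3[i]];  /* load with data-dependent, uncomputable bounds */
;       p1[i] = v;
;       p1[i + X] = v;      /* p1 accesses get ordinary runtime checks */
;     }
;   }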
define void @test1_uncomputable_bounds_single_load(i32* noalias %ptr.1, i32* noalias %ptr.2, i32* noalias %ptr.3, i64 %N, i64 %X) {
-; CHECK-LABEL: @test1_uncomputable_bounds_single_load(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[PTR_11:%.*]] = bitcast i32* [[PTR_1:%.*]] to i8*
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[COND]], label [[PH:%.*]], label [[EXIT:%.*]]
-; CHECK: ph:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[PTR_1]], i64 [[N]]
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[PTR_1]], i64 [[X:%.*]]
-; CHECK-NEXT: [[SCEVGEP34:%.*]] = bitcast i32* [[SCEVGEP3]] to i8*
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[X]], [[N]]
-; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[PTR_1]], i64 [[TMP0]]
-; CHECK-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[PTR_11]], [[SCEVGEP56]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PTR_3:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PTR_2:%.*]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[PTR_2]], i32 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP6]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> poison, i32 [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP10]], i32 1
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP12]], <2 x i32>* [[TMP15]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT: [[TMP16:%.*]] = add nsw i64 [[TMP1]], [[X]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP12]], <2 x i32>* [[TMP19]], align 4, !alias.scope !3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[PH]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR_3]], i64 [[IV]]
-; CHECK-NEXT: [[OFFSET_1:%.*]] = load i32, i32* [[GEP_1]], align 4
-; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[PTR_2]], i32 [[OFFSET_1]]
-; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP_2]], align 4
-; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[LV]], i32* [[GEP_3]], align 4
-; CHECK-NEXT: [[OFFSET_2:%.*]] = add nsw i64 [[IV]], [[X]]
-; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[OFFSET_2]]
-; CHECK-NEXT: store i32 [[LV]], i32* [[GEP_4]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: loop.exit:
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: define void @test1_uncomputable_bounds_single_load
+; CHECK: vector.body
+; CHECK: ret void
entry:
%cond = icmp sgt i64 %N, 0
; Alias set with uncomputable bounds contains a single store. We do not need
; runtime checks for that group and it should not block vectorization.
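;
; Hypothetical C sketch of test2 below (names assumed, illustration only); the
; store counterpart of test1, where the single uncomputable-bounds access in
; the alias set is a store:
;
;   void test2(int *restrict p1, int *restrict p2, int *restrict p3,
;              long N, long X) {
;     for (long i = 0; i < N; i++) {
;       p2[p3[i]] = 20;   /* single store with uncomputable bounds */
;       p1[i] = 0;
;       p1[i + X] = 10;
;     }
;   }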
define void @test2_uncomputable_bounds_single_store(i32* noalias %ptr.1, i32* noalias %ptr.2, i32* noalias %ptr.3, i64 %N, i64 %X) {
-; CHECK-LABEL: @test2_uncomputable_bounds_single_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[PTR_11:%.*]] = bitcast i32* [[PTR_1:%.*]] to i8*
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[COND]], label [[PH:%.*]], label [[EXIT:%.*]]
-; CHECK: ph:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[PTR_1]], i64 [[N]]
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[PTR_1]], i64 [[X:%.*]]
-; CHECK-NEXT: [[SCEVGEP34:%.*]] = bitcast i32* [[SCEVGEP3]] to i8*
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[X]], [[N]]
-; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[PTR_1]], i64 [[TMP0]]
-; CHECK-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[PTR_11]], [[SCEVGEP56]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PTR_3:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PTR_2:%.*]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[PTR_2]], i32 [[TMP7]]
-; CHECK-NEXT: store i32 20, i32* [[TMP6]], align 4
-; CHECK-NEXT: store i32 20, i32* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> zeroinitializer, <2 x i32>* [[TMP11]], align 4, !alias.scope !8, !noalias !11
-; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[TMP1]], [[X]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> <i32 10, i32 10>, <2 x i32>* [[TMP15]], align 4, !alias.scope !11
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[PH]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR_3]], i64 [[IV]]
-; CHECK-NEXT: [[OFFSET_1:%.*]] = load i32, i32* [[GEP_1]], align 4
-; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[PTR_2]], i32 [[OFFSET_1]]
-; CHECK-NEXT: store i32 20, i32* [[GEP_2]], align 4
-; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[IV]]
-; CHECK-NEXT: store i32 0, i32* [[GEP_3]], align 4
-; CHECK-NEXT: [[OFFSET_2:%.*]] = add nsw i64 [[IV]], [[X]]
-; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[OFFSET_2]]
-; CHECK-NEXT: store i32 10, i32* [[GEP_4]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: loop.exit:
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: define void @test2_uncomputable_bounds_single_store
+; CHECK: vector.body
+; CHECK: ret void
entry:
%cond = icmp sgt i64 %N, 0
; Alias set with uncomputable bounds contains a load and a store. This blocks
; vectorization, as we cannot generate runtime checks for the set.
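;
; Hypothetical C sketch of test3 below (names assumed, illustration only). The
; alias set for p2 contains both a store with uncomputable bounds and an
; ordinary load, so a runtime check between them would be required but cannot
; be generated, and the loop stays scalar:
;
;   void test3(int *restrict p1, int *restrict p2, int *restrict p3,
;              long N, long X) {
;     for (long i = 0; i < N; i++) {
;       p2[p3[i]] = 20;   /* store with uncomputable bounds */
;       int v = p2[i];    /* load from the same alias set   */
;       p1[i] = v;
;       p1[i + X] = v;
;     }
;   }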
define void @test3_uncomputable_bounds_load_store(i32* noalias %ptr.1, i32* noalias %ptr.2, i32* noalias %ptr.3, i64 %N, i64 %X) {
-; CHECK-LABEL: @test3_uncomputable_bounds_load_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[COND]], label [[PH:%.*]], label [[EXIT:%.*]]
-; CHECK: ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR_3:%.*]], i64 [[IV]]
-; CHECK-NEXT: [[OFFSET_1:%.*]] = load i32, i32* [[GEP_1]], align 4
-; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[PTR_2:%.*]], i32 [[OFFSET_1]]
-; CHECK-NEXT: store i32 20, i32* [[GEP_2]], align 4
-; CHECK-NEXT: [[GEP_22:%.*]] = getelementptr inbounds i32, i32* [[PTR_2]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[GEP_22]], align 4
-; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[PTR_1:%.*]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[LV]], i32* [[GEP_3]], align 4
-; CHECK-NEXT: [[OFFSET_2:%.*]] = add nsw i64 [[IV]], [[X:%.*]]
-; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[OFFSET_2]]
-; CHECK-NEXT: store i32 [[LV]], i32* [[GEP_4]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT:%.*]], label [[LOOP]]
-; CHECK: loop.exit:
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: define void @test3_uncomputable_bounds_load_store
+; CHECK-NOT: vector.body
entry:
%cond = icmp sgt i64 %N, 0
; Alias set with uncomputable bounds contains two stores. This blocks
; vectorization, as we cannot generate runtime checks for the set.
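;
; Hypothetical C sketch of test4 below (names assumed, illustration only); as
; in test3, the p2 alias set has more than one access and one of them has
; uncomputable bounds, so no runtime check can be generated and the loop is
; not vectorized:
;
;   void test4(int *restrict p1, int *restrict p2, int *restrict p3,
;              long N, long X) {
;     for (long i = 0; i < N; i++) {
;       p2[p3[i]] = 20;   /* store with uncomputable bounds     */
;       p2[i] = 30;       /* second store in the same alias set */
;       p1[i] = 0;
;       p1[i + X] = 10;
;     }
;   }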
define void @test4_uncomputable_bounds_store_store(i32* noalias %ptr.1, i32* noalias %ptr.2, i32* noalias %ptr.3, i64 %N, i64 %X) {
-; CHECK-LABEL: @test4_uncomputable_bounds_store_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[COND:%.*]] = icmp sgt i64 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[COND]], label [[PH:%.*]], label [[EXIT:%.*]]
-; CHECK: ph:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[PTR_3:%.*]], i64 [[IV]]
-; CHECK-NEXT: [[OFFSET_1:%.*]] = load i32, i32* [[GEP_1]], align 4
-; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[PTR_2:%.*]], i32 [[OFFSET_1]]
-; CHECK-NEXT: store i32 20, i32* [[GEP_2]], align 4
-; CHECK-NEXT: [[GEP_22:%.*]] = getelementptr inbounds i32, i32* [[PTR_2]], i64 [[IV]]
-; CHECK-NEXT: store i32 30, i32* [[GEP_22]], align 4
-; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[PTR_1:%.*]], i64 [[IV]]
-; CHECK-NEXT: store i32 0, i32* [[GEP_3]], align 4
-; CHECK-NEXT: [[OFFSET_2:%.*]] = add nsw i64 [[IV]], [[X:%.*]]
-; CHECK-NEXT: [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[PTR_1]], i64 [[OFFSET_2]]
-; CHECK-NEXT: store i32 10, i32* [[GEP_4]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_EXIT:%.*]], label [[LOOP]]
-; CHECK: loop.exit:
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: define void @test4_uncomputable_bounds_store_store
+; CHECK-NOT: vector.body
entry:
%cond = icmp sgt i64 %N, 0
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S | FileCheck %s
define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <2 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 2
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD]], <float 1.000000e+02, float 1.000000e+02>
-; CHECK-NEXT: [[TMP9:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD1]], <float 1.000000e+02, float 1.000000e+02>
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP8]], i32 0
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP10]])
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[TMP8]], i32 1
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP11]])
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP9]], i32 0
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP12]])
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]])
-; CHECK-NEXT: [[TMP14:%.*]] = fadd <2 x float> [[WIDE_LOAD]], <float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT: [[TMP15:%.*]] = fadd <2 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> [[TMP14]], <2 x float>* [[TMP19]], align 4
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 2
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> [[TMP15]], <2 x float>* [[TMP21]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1600, 1600
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1600, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP23:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP23]], 1.000000e+02
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP1]])
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP23]], 1.000000e+00
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* {{.*}}, align 4
+; CHECK: [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* {{.*}}, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD]], <float 1.000000e+02, float 1.000000e+02>
+; CHECK-NEXT: [[TMP2:%.*]] = fcmp ogt <2 x float> [[WIDE_LOAD1]], <float 1.000000e+02, float 1.000000e+02>
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP4]])
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP5]])
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP6]])
entry:
br label %for.body
define void @test2(%struct.data* nocapture readonly %d) {
; CHECK-LABEL: @test2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_DATA:%.*]], %struct.data* [[D:%.*]], i64 0, i32 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float*, float** [[B]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to i8*
-; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint float* [[TMP0]] to i64
-; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31
-; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_DATA]], %struct.data* [[D]], i64 0, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = load float*, float** [[A]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to i8*
-; CHECK-NEXT: [[PTRINT2:%.*]] = ptrtoint float* [[TMP2]] to i64
-; CHECK-NEXT: [[MASKEDPTR3:%.*]] = and i64 [[PTRINT2]], 31
-; CHECK-NEXT: [[MASKCOND4:%.*]] = icmp eq i64 [[MASKEDPTR3]], 0
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[TMP2]], i64 1600
-; CHECK-NEXT: [[SCEVGEP1:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr float, float* [[TMP0]], i64 1600
-; CHECK-NEXT: [[SCEVGEP23:%.*]] = bitcast float* [[SCEVGEP2]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[TMP3]], [[SCEVGEP23]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TMP1]], [[SCEVGEP1]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: entry:
+; CHECK: [[MASKCOND:%.*]] = icmp eq i64 %maskedptr, 0
+; CHECK: [[MASKCOND4:%.*]] = icmp eq i64 %maskedptr3, 0
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
+; CHECK: tail call void @llvm.assume(i1 [[MASKCOND]])
; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <2 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4, !alias.scope !4
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 2
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <2 x float>*
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x float>, <2 x float>* [[TMP11]], align 4, !alias.scope !4
-; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[WIDE_LOAD]], <float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[WIDE_LOAD4]], <float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
+; CHECK: tail call void @llvm.assume(i1 [[MASKCOND4]])
; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> [[TMP12]], <2 x float>* [[TMP17]], align 4, !alias.scope !7, !noalias !4
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 2
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> [[TMP13]], <2 x float>* [[TMP19]], align 4, !alias.scope !7, !noalias !4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1600, 1600
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1600, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP21:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP21]], 1.000000e+00
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
entry:
%b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
%0 = load float*, float** %b, align 8
define void @predicated_assume(float* noalias nocapture readonly %a, float* noalias nocapture %b, i32 %n) {
; Check that the vector.body does not contain any assumes.
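; For illustration only, a hypothetical C equivalent of this function (names
; assumed): the assume sits on a predicated path in the scalar loop, and the
; checks verify that no llvm.assume call survives in the vector body:
;
;   void predicated_assume(const float *restrict a, float *restrict b,
;                          unsigned n) {
;     for (unsigned long i = 0; i < n; i++) {
;       float x = 23.0f;
;       if (i >= 495616) {
;         __builtin_assume(i < 991232);
;         x = 42.0f;
;       }
;       b[i] = x * a[i];
;     }
;   }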
; CHECK-LABEL: @predicated_assume(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <2 x i64> [[VEC_IND]], <i64 495616, i64 495616>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <2 x i64> [[STEP_ADD]], <i64 495616, i64 495616>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <2 x i64> [[VEC_IND]], <i64 991232, i64 991232>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <2 x i64> [[STEP_ADD]], <i64 991232, i64 991232>
-; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
-; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP3]], <2 x float> <float 2.300000e+01, float 2.300000e+01>, <2 x float> <float 4.200000e+01, float 4.200000e+01>
-; CHECK-NEXT: [[PREDPHI2:%.*]] = select <2 x i1> [[TMP4]], <2 x float> <float 2.300000e+01, float 2.300000e+01>, <2 x float> <float 4.200000e+01, float 4.200000e+01>
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <2 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP12]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 2
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <2 x float>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x float>, <2 x float>* [[TMP14]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = fmul <2 x float> [[PREDPHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x float> [[PREDPHI2]], [[WIDE_LOAD3]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> [[TMP15]], <2 x float>* [[TMP20]], align 4
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 2
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <2 x float>*
-; CHECK-NEXT: store <2 x float> [[TMP16]], <2 x float>* [[TMP22]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], <i64 2, i64 2>
-; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
+; CHECK-NOT: llvm.assume
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END5:%.*]] ]
-; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[INDVARS_IV]], 495616
-; CHECK-NEXT: br i1 [[CMP1]], label [[IF_END5]], label [[IF_ELSE:%.*]]
-; CHECK: if.else:
-; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 991232
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]])
-; CHECK-NEXT: br label [[IF_END5]]
-; CHECK: if.end5:
-; CHECK-NEXT: [[X_0:%.*]] = phi float [ 4.200000e+01, [[IF_ELSE]] ], [ 2.300000e+01, [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = fmul float [[X_0]], [[TMP24]]
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[MUL]], float* [[ARRAYIDX7]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-;
entry:
%cmp15 = icmp eq i32 %n, 0
br i1 %cmp15, label %for.cond.cleanup, label %for.body.preheader
; Function Attrs: nofree norecurse nounwind uwtable
define dso_local void @_Z3foov() local_unnamed_addr #0 {
; CHECK-LABEL: @_Z3foov(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !tbaa [[TBAA2:![0-9]+]]
-; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[MUL]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
-;
-; CHECK-MASKED-LABEL: @_Z3foov(
-; CHECK-MASKED-NEXT: entry:
-; CHECK-MASKED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-MASKED: vector.ph:
-; CHECK-MASKED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-MASKED: vector.body:
-; CHECK-MASKED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MASKED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MASKED-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT: [[STEP_ADD1:%.*]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT: [[STEP_ADD2:%.*]] = add <4 x i32> [[STEP_ADD1]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MASKED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-MASKED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
-; CHECK-MASKED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12
-; CHECK-MASKED-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP0]]
-; CHECK-MASKED-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP1]]
-; CHECK-MASKED-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP2]]
-; CHECK-MASKED-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP3]]
-; CHECK-MASKED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-MASKED-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !tbaa [[TBAA2:![0-9]+]]
-; CHECK-MASKED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4
-; CHECK-MASKED-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
-; CHECK-MASKED-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12
-; CHECK-MASKED-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND]]
-; CHECK-MASKED-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD4]], [[STEP_ADD]]
-; CHECK-MASKED-NEXT: [[TMP18:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD5]], [[STEP_ADD1]]
-; CHECK-MASKED-NEXT: [[TMP19:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD6]], [[STEP_ADD2]]
-; CHECK-MASKED-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP0]]
-; CHECK-MASKED-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP1]]
-; CHECK-MASKED-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP2]]
-; CHECK-MASKED-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP3]]
-; CHECK-MASKED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-MASKED-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP25]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4
-; CHECK-MASKED-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8
-; CHECK-MASKED-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12
-; CHECK-MASKED-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP31]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD7]], [[TMP16]]
-; CHECK-MASKED-NEXT: [[TMP33:%.*]] = add nsw <4 x i32> [[WIDE_LOAD8]], [[TMP17]]
-; CHECK-MASKED-NEXT: [[TMP34:%.*]] = add nsw <4 x i32> [[WIDE_LOAD9]], [[TMP18]]
-; CHECK-MASKED-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP19]]
-; CHECK-MASKED-NEXT: [[TMP36:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; CHECK-MASKED-NEXT: store <4 x i32> [[TMP32]], <4 x i32>* [[TMP36]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-MASKED-NEXT: store <4 x i32> [[TMP33]], <4 x i32>* [[TMP37]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP38:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; CHECK-MASKED-NEXT: store <4 x i32> [[TMP34]], <4 x i32>* [[TMP38]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>*
-; CHECK-MASKED-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP39]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-MASKED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD2]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-MASKED-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK-MASKED: middle.block:
-; CHECK-MASKED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
-; CHECK-MASKED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-MASKED: scalar.ph:
-; CHECK-MASKED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-MASKED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-MASKED: for.cond.cleanup:
-; CHECK-MASKED-NEXT: ret void
-; CHECK-MASKED: for.body:
-; CHECK-MASKED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-MASKED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[INDVARS_IV]]
-; CHECK-MASKED-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP42:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-MASKED-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
-; CHECK-MASKED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-MASKED-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP43]], [[MUL]]
-; CHECK-MASKED-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-MASKED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
-; CHECK-MASKED-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: [[VECTOR_BODY:vector\.body]]:
+; CHECK: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
+; CHECK: [[FOR_BODY:for\.body]]:
+; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
+; CHECK-MASKED: [[VECTOR_BODY:vector\.body]]:
+; CHECK-MASKED: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
+; CHECK-MASKED: [[FOR_BODY:for\.body]]:
+; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP0_0:\!.*]],
;
entry:
br label %for.body
; Function Attrs: nofree norecurse nounwind uwtable
define dso_local void @_Z3foo2v() local_unnamed_addr #0 {
; CHECK-LABEL: @_Z3foo2v(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP8]], <4 x i32>* [[TMP9]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF6]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1027, 1024
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT: [[TMP12:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[MUL]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1027
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF13:![0-9]+]], !llvm.loop [[LOOP14:![0-9]+]]
-;
-; CHECK-MASKED-LABEL: @_Z3foo2v(
-; CHECK-MASKED-NEXT: entry:
-; CHECK-MASKED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-MASKED: vector.ph:
-; CHECK-MASKED-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-MASKED: vector.body:
-; CHECK-MASKED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MASKED-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-MASKED-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT: [[STEP_ADD1:%.*]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT: [[STEP_ADD2:%.*]] = add <4 x i32> [[STEP_ADD1]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-MASKED-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-MASKED-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 8
-; CHECK-MASKED-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 12
-; CHECK-MASKED-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP0]]
-; CHECK-MASKED-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP1]]
-; CHECK-MASKED-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP2]]
-; CHECK-MASKED-NEXT: [[TMP7:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[TMP3]]
-; CHECK-MASKED-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-MASKED-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4
-; CHECK-MASKED-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
-; CHECK-MASKED-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 12
-; CHECK-MASKED-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP15]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP16:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], [[VEC_IND]]
-; CHECK-MASKED-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD4]], [[STEP_ADD]]
-; CHECK-MASKED-NEXT: [[TMP18:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD5]], [[STEP_ADD1]]
-; CHECK-MASKED-NEXT: [[TMP19:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD6]], [[STEP_ADD2]]
-; CHECK-MASKED-NEXT: [[TMP20:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP0]]
-; CHECK-MASKED-NEXT: [[TMP21:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP1]]
-; CHECK-MASKED-NEXT: [[TMP22:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP2]]
-; CHECK-MASKED-NEXT: [[TMP23:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[TMP3]]
-; CHECK-MASKED-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-MASKED-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP25]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 4
-; CHECK-MASKED-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP27]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 8
-; CHECK-MASKED-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 12
-; CHECK-MASKED-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>*
-; CHECK-MASKED-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i32>, <4 x i32>* [[TMP31]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD7]], [[TMP16]]
-; CHECK-MASKED-NEXT: [[TMP33:%.*]] = add nsw <4 x i32> [[WIDE_LOAD8]], [[TMP17]]
-; CHECK-MASKED-NEXT: [[TMP34:%.*]] = add nsw <4 x i32> [[WIDE_LOAD9]], [[TMP18]]
-; CHECK-MASKED-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD10]], [[TMP19]]
-; CHECK-MASKED-NEXT: [[TMP36:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; CHECK-MASKED-NEXT: store <4 x i32> [[TMP32]], <4 x i32>* [[TMP36]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
-; CHECK-MASKED-NEXT: store <4 x i32> [[TMP33]], <4 x i32>* [[TMP37]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP38:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; CHECK-MASKED-NEXT: store <4 x i32> [[TMP34]], <4 x i32>* [[TMP38]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP30]] to <4 x i32>*
-; CHECK-MASKED-NEXT: store <4 x i32> [[TMP35]], <4 x i32>* [[TMP39]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-MASKED-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD2]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-MASKED-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-MASKED-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF6]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK-MASKED: middle.block:
-; CHECK-MASKED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1027, 1024
-; CHECK-MASKED-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK-MASKED: scalar.ph:
-; CHECK-MASKED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-MASKED-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK-MASKED: for.cond.cleanup:
-; CHECK-MASKED-NEXT: ret void
-; CHECK-MASKED: for.body:
-; CHECK-MASKED-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-MASKED-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @b, i64 0, i64 [[INDVARS_IV]]
-; CHECK-MASKED-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[TMP42:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-MASKED-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
-; CHECK-MASKED-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-MASKED-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP43]], [[MUL]]
-; CHECK-MASKED-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA2]]
-; CHECK-MASKED-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-MASKED-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1027
-; CHECK-MASKED-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF13:![0-9]+]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK: [[VECTOR_BODY:vector\.body]]:
+; CHECK: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_255:\!.*]],
+; CHECK: [[FOR_BODY:for\.body]]:
+; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
+; CHECK-MASKED: [[VECTOR_BODY:vector\.body]]:
+; CHECK-MASKED: br i1 [[TMP:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP1_63:\!.*]],
+; CHECK-MASKED: [[FOR_BODY:for\.body]]:
+; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP1_2:\!.*]],
;
entry:
br label %for.body
!llvm.module.flags = !{!0}
!llvm.ident = !{!1}
+; CHECK: [[LP1_255]] = !{!"branch_weights", i32 1, i32 255}
+; CHECK: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK-MASKED: [[LP1_63]] = !{!"branch_weights", i32 1, i32 63}
+; CHECK-MASKED: [[LP0_0]] = !{!"branch_weights", i32 0, i32 0}
+; CHECK: [[LP1_2]] = !{!"branch_weights", i32 1, i32 2}
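; The weights above follow from the trip counts visible in the checks: the main
; vector loop steps its index by 4 up to 1024, i.e. 256 iterations (weights
; 1, 255); the loop checked under the CHECK-MASKED prefix steps by 16, i.e. 64
; iterations (weights 1, 63); and the scalar loop covers the remaining
; 1027 - 1024 = 3 iterations (weights 1, 2).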
!0 = !{i32 1, !"wchar_size", i32 4}
!1 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project c292b5b5e059e6ce3e6449e6827ef7e1037c21c4)"}
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -enable-cond-stores-vec=false -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
; RUN: opt < %s -enable-cond-stores-vec=false -passes=loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
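; With -enable-cond-stores-vec=false the conditionally executed store in
; @conditional_store below is expected to keep the loop scalar; the RUN lines
; request missed/analysis remarks from the vectorizer and pipe stderr into
; FileCheck via 2>&1 so those remarks can be checked.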
; Function Attrs: nounwind ssp uwtable
define void @conditional_store(i32* noalias nocapture %indices) #0 !dbg !4 {
-; CHECK-LABEL: @conditional_store(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG8:![0-9]+]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[INDICES:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG10:![0-9]+]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !dbg [[DBG10]], !tbaa [[TBAA12:![0-9]+]]
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP0]], 1024, !dbg [[DBG10]]
-; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]], !dbg [[DBG10]]
-; CHECK: if.then:
-; CHECK-NEXT: store i32 0, i32* [[ARRAYIDX]], align 4, !dbg [[DBG16:![0-9]+]], !tbaa [[TBAA12]]
-; CHECK-NEXT: br label [[FOR_INC]], !dbg [[DBG16]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG8]]
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4096, !dbg [[DBG8]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !dbg [[DBG8]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void, !dbg [[DBG17:![0-9]+]]
-;
entry:
br label %for.body, !dbg !10
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness < %s 2>&1 | FileCheck %s
; RUN: opt -S -passes=loop-vectorize -pass-remarks-analysis=loop-vectorize -pass-remarks-with-hotness < %s 2>&1 | FileCheck %s
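; @cold and @hot below carry !prof entry counts while @unknown does not, so
; with -pass-remarks-with-hotness the analysis remarks for the first two
; should include hotness information and the remark for @unknown should not.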
; Function Attrs: norecurse nounwind ssp uwtable
define void @cold(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !7 !prof !56 {
-; CHECK-LABEL: @cold(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG10:![0-9]+]]
-; CHECK-NEXT: br i1 [[CMP28]], label [[PH:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG11:![0-9]+]], !prof [[PROF12:![0-9]+]]
-; CHECK: ph:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG13:![0-9]+]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG13]], !tbaa [[TBAA14:![0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG17:![0-9]+]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG17]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG18:![0-9]+]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG11]]
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG19:![0-9]+]]
-; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG20:![0-9]+]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG21:![0-9]+]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG21]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG22:![0-9]+]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG22]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG23:![0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG24:![0-9]+]]
-; CHECK-NEXT: store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG25:![0-9]+]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG11]]
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG11]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG11]], !prof [[PROF26:![0-9]+]], !llvm.loop [[LOOP27:![0-9]+]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]], !dbg [[DBG28:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void, !dbg [[DBG28]]
-;
entry:
%cmp28 = icmp sgt i32 %N, 0, !dbg !9
br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !10, !prof !58
; Function Attrs: norecurse nounwind ssp uwtable
define void @hot(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !26 !prof !57 {
-; CHECK-LABEL: @hot(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG31:![0-9]+]]
-; CHECK-NEXT: br i1 [[CMP28]], label [[PH:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG32:![0-9]+]], !prof [[PROF12]]
-; CHECK: ph:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG33:![0-9]+]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG33]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG34:![0-9]+]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG34]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG35:![0-9]+]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG32]]
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG36:![0-9]+]]
-; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG38:![0-9]+]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG38]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG39:![0-9]+]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG39]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG40:![0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG41:![0-9]+]]
-; CHECK-NEXT: store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG42:![0-9]+]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG32]]
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG32]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG32]], !prof [[PROF26]], !llvm.loop [[LOOP43:![0-9]+]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]], !dbg [[DBG44:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void, !dbg [[DBG44]]
-;
entry:
%cmp28 = icmp sgt i32 %N, 0, !dbg !27
br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !28, !prof !58
; Function Attrs: norecurse nounwind ssp uwtable
define void @unknown(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !41 {
-; CHECK-LABEL: @unknown(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG46:![0-9]+]]
-; CHECK-NEXT: br i1 [[CMP28]], label [[PH:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG47:![0-9]+]]
-; CHECK: ph:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG48:![0-9]+]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG48]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG49:![0-9]+]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG49]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG50:![0-9]+]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG47]]
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG51:![0-9]+]]
-; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG52:![0-9]+]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG53:![0-9]+]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG53]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG54:![0-9]+]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG54]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG55:![0-9]+]]
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG56:![0-9]+]]
-; CHECK-NEXT: store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG57:![0-9]+]], !tbaa [[TBAA14]]
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG47]]
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG47]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG47]], !llvm.loop [[LOOP58:![0-9]+]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]], !dbg [[DBG59:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void, !dbg [[DBG59]]
-;
entry:
%cmp28 = icmp sgt i32 %N, 0, !dbg !42
br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !43
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -pass-remarks-missed=loop-vectorize \
; RUN: -pass-remarks-with-hotness < %s 2>&1 | \
; RUN: FileCheck -check-prefix=HOTNESS -check-prefix=BOTH %s
; Function Attrs: norecurse nounwind ssp uwtable
define void @cold(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !7 !prof !56 {
-; HOTNESS-LABEL: @cold(
-; HOTNESS-NEXT: entry:
-; HOTNESS-NEXT: [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG10:![0-9]+]]
-; HOTNESS-NEXT: br i1 [[CMP28]], label [[PH:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG11:![0-9]+]], !prof [[PROF12:![0-9]+]]
-; HOTNESS: ph:
-; HOTNESS-NEXT: br label [[FOR_BODY:%.*]]
-; HOTNESS: for.body:
-; HOTNESS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[PH]] ]
-; HOTNESS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG13:![0-9]+]]
-; HOTNESS-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG13]], !tbaa [[TBAA14:![0-9]+]]
-; HOTNESS-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG17:![0-9]+]]
-; HOTNESS-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG17]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG18:![0-9]+]]
-; HOTNESS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG11]]
-; HOTNESS-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG19:![0-9]+]]
-; HOTNESS-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG20:![0-9]+]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG21:![0-9]+]]
-; HOTNESS-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG21]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG22:![0-9]+]]
-; HOTNESS-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG22]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG23:![0-9]+]]
-; HOTNESS-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG24:![0-9]+]]
-; HOTNESS-NEXT: store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG25:![0-9]+]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG11]]
-; HOTNESS-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG11]]
-; HOTNESS-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG11]], !prof [[PROF26:![0-9]+]], !llvm.loop [[LOOP27:![0-9]+]]
-; HOTNESS: for.cond.cleanup.loopexit:
-; HOTNESS-NEXT: br label [[FOR_COND_CLEANUP]], !dbg [[DBG28:![0-9]+]]
-; HOTNESS: for.cond.cleanup:
-; HOTNESS-NEXT: ret void, !dbg [[DBG28]]
-;
-; NO_HOTNESS-LABEL: @cold(
-; NO_HOTNESS-NEXT: entry:
-; NO_HOTNESS-NEXT: [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG10:![0-9]+]]
-; NO_HOTNESS-NEXT: br i1 [[CMP28]], label [[PH:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG11:![0-9]+]], !prof [[PROF12:![0-9]+]]
-; NO_HOTNESS: ph:
-; NO_HOTNESS-NEXT: br label [[FOR_BODY:%.*]]
-; NO_HOTNESS: for.body:
-; NO_HOTNESS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[PH]] ]
-; NO_HOTNESS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG13:![0-9]+]]
-; NO_HOTNESS-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG13]], !tbaa [[TBAA14:![0-9]+]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG17:![0-9]+]]
-; NO_HOTNESS-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG17]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG18:![0-9]+]]
-; NO_HOTNESS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG11]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG19:![0-9]+]]
-; NO_HOTNESS-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG20:![0-9]+]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG21:![0-9]+]]
-; NO_HOTNESS-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG21]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG22:![0-9]+]]
-; NO_HOTNESS-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG22]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG23:![0-9]+]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG24:![0-9]+]]
-; NO_HOTNESS-NEXT: store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG25:![0-9]+]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG11]]
-; NO_HOTNESS-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG11]]
-; NO_HOTNESS-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG11]], !prof [[PROF26:![0-9]+]], !llvm.loop [[LOOP27:![0-9]+]]
-; NO_HOTNESS: for.cond.cleanup.loopexit:
-; NO_HOTNESS-NEXT: br label [[FOR_COND_CLEANUP]], !dbg [[DBG28:![0-9]+]]
-; NO_HOTNESS: for.cond.cleanup:
-; NO_HOTNESS-NEXT: ret void, !dbg [[DBG28]]
-;
entry:
%cmp28 = icmp sgt i32 %N, 0, !dbg !9
br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !10, !prof !58
; Function Attrs: norecurse nounwind ssp uwtable
define void @hot(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !26 !prof !57 {
-; HOTNESS-LABEL: @hot(
-; HOTNESS-NEXT: entry:
-; HOTNESS-NEXT: [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG31:![0-9]+]]
-; HOTNESS-NEXT: br i1 [[CMP28]], label [[PH:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG32:![0-9]+]], !prof [[PROF12]]
-; HOTNESS: ph:
-; HOTNESS-NEXT: br label [[FOR_BODY:%.*]]
-; HOTNESS: for.body:
-; HOTNESS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[PH]] ]
-; HOTNESS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG33:![0-9]+]]
-; HOTNESS-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG33]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG34:![0-9]+]]
-; HOTNESS-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG34]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG35:![0-9]+]]
-; HOTNESS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG32]]
-; HOTNESS-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG36:![0-9]+]]
-; HOTNESS-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG38:![0-9]+]]
-; HOTNESS-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG38]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG39:![0-9]+]]
-; HOTNESS-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG39]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG40:![0-9]+]]
-; HOTNESS-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG41:![0-9]+]]
-; HOTNESS-NEXT: store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG42:![0-9]+]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG32]]
-; HOTNESS-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG32]]
-; HOTNESS-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG32]], !prof [[PROF26]], !llvm.loop [[LOOP43:![0-9]+]]
-; HOTNESS: for.cond.cleanup.loopexit:
-; HOTNESS-NEXT: br label [[FOR_COND_CLEANUP]], !dbg [[DBG44:![0-9]+]]
-; HOTNESS: for.cond.cleanup:
-; HOTNESS-NEXT: ret void, !dbg [[DBG44]]
-;
-; NO_HOTNESS-LABEL: @hot(
-; NO_HOTNESS-NEXT: entry:
-; NO_HOTNESS-NEXT: [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG31:![0-9]+]]
-; NO_HOTNESS-NEXT: br i1 [[CMP28]], label [[PH:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG32:![0-9]+]], !prof [[PROF12]]
-; NO_HOTNESS: ph:
-; NO_HOTNESS-NEXT: br label [[FOR_BODY:%.*]]
-; NO_HOTNESS: for.body:
-; NO_HOTNESS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[PH]] ]
-; NO_HOTNESS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG33:![0-9]+]]
-; NO_HOTNESS-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG33]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG34:![0-9]+]]
-; NO_HOTNESS-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG34]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG35:![0-9]+]]
-; NO_HOTNESS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG32]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG36:![0-9]+]]
-; NO_HOTNESS-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG37:![0-9]+]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG38:![0-9]+]]
-; NO_HOTNESS-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG38]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG39:![0-9]+]]
-; NO_HOTNESS-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG39]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG40:![0-9]+]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG41:![0-9]+]]
-; NO_HOTNESS-NEXT: store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG42:![0-9]+]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG32]]
-; NO_HOTNESS-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG32]]
-; NO_HOTNESS-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG32]], !prof [[PROF26]], !llvm.loop [[LOOP43:![0-9]+]]
-; NO_HOTNESS: for.cond.cleanup.loopexit:
-; NO_HOTNESS-NEXT: br label [[FOR_COND_CLEANUP]], !dbg [[DBG44:![0-9]+]]
-; NO_HOTNESS: for.cond.cleanup:
-; NO_HOTNESS-NEXT: ret void, !dbg [[DBG44]]
-;
entry:
%cmp28 = icmp sgt i32 %N, 0, !dbg !27
br i1 %cmp28, label %ph, label %for.cond.cleanup, !dbg !28, !prof !58
; Function Attrs: norecurse nounwind ssp uwtable
define void @unknown(i8* nocapture %A, i8* nocapture readonly %B, i8* nocapture %C, i8* nocapture readonly %D, i8* nocapture readonly %E, i32 %N) local_unnamed_addr #0 !dbg !41 {
-; HOTNESS-LABEL: @unknown(
-; HOTNESS-NEXT: entry:
-; HOTNESS-NEXT: [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG46:![0-9]+]]
-; HOTNESS-NEXT: br i1 [[CMP28]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG47:![0-9]+]]
-; HOTNESS: for.body.preheader:
-; HOTNESS-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG47]]
-; HOTNESS: for.cond.cleanup.loopexit:
-; HOTNESS-NEXT: br label [[FOR_COND_CLEANUP]], !dbg [[DBG48:![0-9]+]]
-; HOTNESS: for.cond.cleanup:
-; HOTNESS-NEXT: ret void, !dbg [[DBG48]]
-; HOTNESS: for.body:
-; HOTNESS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; HOTNESS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG49:![0-9]+]]
-; HOTNESS-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG49]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG50:![0-9]+]]
-; HOTNESS-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG50]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG51:![0-9]+]]
-; HOTNESS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG47]]
-; HOTNESS-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG52:![0-9]+]]
-; HOTNESS-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG53:![0-9]+]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG54:![0-9]+]]
-; HOTNESS-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG54]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG55:![0-9]+]]
-; HOTNESS-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG55]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG56:![0-9]+]]
-; HOTNESS-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG57:![0-9]+]]
-; HOTNESS-NEXT: store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG58:![0-9]+]], !tbaa [[TBAA14]]
-; HOTNESS-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG47]]
-; HOTNESS-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG47]]
-; HOTNESS-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG47]], !llvm.loop [[LOOP59:![0-9]+]]
-;
-; NO_HOTNESS-LABEL: @unknown(
-; NO_HOTNESS-NEXT: entry:
-; NO_HOTNESS-NEXT: [[CMP28:%.*]] = icmp sgt i32 [[N:%.*]], 0, !dbg [[DBG46:![0-9]+]]
-; NO_HOTNESS-NEXT: br i1 [[CMP28]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]], !dbg [[DBG47:![0-9]+]]
-; NO_HOTNESS: for.body.preheader:
-; NO_HOTNESS-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG47]]
-; NO_HOTNESS: for.cond.cleanup.loopexit:
-; NO_HOTNESS-NEXT: br label [[FOR_COND_CLEANUP]], !dbg [[DBG48:![0-9]+]]
-; NO_HOTNESS: for.cond.cleanup:
-; NO_HOTNESS-NEXT: ret void, !dbg [[DBG48]]
-; NO_HOTNESS: for.body:
-; NO_HOTNESS-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; NO_HOTNESS-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG49:![0-9]+]]
-; NO_HOTNESS-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1, !dbg [[DBG49]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG50:![0-9]+]]
-; NO_HOTNESS-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX2]], align 1, !dbg [[DBG50]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[ADD:%.*]] = add i8 [[TMP1]], [[TMP0]], !dbg [[DBG51:![0-9]+]]
-; NO_HOTNESS-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG47]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDVARS_IV_NEXT]], !dbg [[DBG52:![0-9]+]]
-; NO_HOTNESS-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX7]], align 1, !dbg [[DBG53:![0-9]+]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, i8* [[D:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG54:![0-9]+]]
-; NO_HOTNESS-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX9]], align 1, !dbg [[DBG54]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i8, i8* [[E:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG55:![0-9]+]]
-; NO_HOTNESS-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX12]], align 1, !dbg [[DBG55]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[MUL:%.*]] = mul i8 [[TMP3]], [[TMP2]], !dbg [[DBG56:![0-9]+]]
-; NO_HOTNESS-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, i8* [[C:%.*]], i64 [[INDVARS_IV]], !dbg [[DBG57:![0-9]+]]
-; NO_HOTNESS-NEXT: store i8 [[MUL]], i8* [[ARRAYIDX16]], align 1, !dbg [[DBG58:![0-9]+]], !tbaa [[TBAA14]]
-; NO_HOTNESS-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG47]]
-; NO_HOTNESS-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]], !dbg [[DBG47]]
-; NO_HOTNESS-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !dbg [[DBG47]], !llvm.loop [[LOOP59:![0-9]+]]
-;
entry:
%cmp28 = icmp sgt i32 %N, 0, !dbg !42
br i1 %cmp28, label %for.body, label %for.cond.cleanup, !dbg !43
; CHECK-NEXT: store <2 x i32> <i32 13, i32 13>, <2 x i32>* [[TMP11]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 6
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1800
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1800, 1800
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 1800
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
; CHECK-NEXT: store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, <4 x i32>* [[TMP3]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[ALIGNEDTC]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[ALIGNEDTC]]
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
; CHECK-NEXT: store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, <4 x i32>* [[TMP3]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP5:!llvm.loop !.*]]
; CHECK: exit.loopexit:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP7:!llvm.loop !.*]]
; CHECK: exit.loopexit:
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i64> [ [[TMP7]], [[VEC_EPILOG_PH]] ], [ [[TMP12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX6]], 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP9]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <4 x i64>*
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, <4 x i64>* [[TMP11]], align 4
; CHECK-NEXT: [[TMP12]] = add <4 x i64> [[WIDE_LOAD8]], [[VEC_PHI7]]
-; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[MINMAX_IDENT_SPLAT:%.*]] = shufflevector <4 x float> [[MINMAX_IDENT_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x float> [ [[MINMAX_IDENT_SPLAT]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX6]], 0
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP11]], align 4
; CHECK-NEXT: [[TMP12:%.*]] = fcmp fast ogt <4 x float> [[VEC_PHI7]], [[WIDE_LOAD8]]
; CHECK-NEXT: [[TMP13]] = select <4 x i1> [[TMP12]], <4 x float> [[VEC_PHI7]], <4 x float> [[WIDE_LOAD8]]
-; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP6]] to <4 x i16>
; CHECK-NEXT: [[TMP9]] = zext <4 x i16> [[TMP8]] to <4 x i32>
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i32> [[TMP9]] to <4 x i16>
; CHECK-NEXT: [[TMP11:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP10]])
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX3:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ [[TMP13]], [[VEC_EPILOG_PH]] ], [ [[TMP23:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[INDEX3]], 0
; CHECK-NEXT: [[TMP15:%.*]] = and <4 x i32> [[VEC_PHI4]], <i32 65535, i32 65535, i32 65535, i32 65535>
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i32 [[TMP14]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, i16* [[TMP16]], i32 0
; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i16>, <4 x i16>* [[TMP18]], align 2
; CHECK-NEXT: [[TMP19:%.*]] = zext <4 x i16> [[WIDE_LOAD5]] to <4 x i32>
; CHECK-NEXT: [[TMP20:%.*]] = or <4 x i32> [[TMP15]], [[TMP19]]
-; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i32 [[INDEX3]], 4
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT6]], 256
; CHECK-NEXT: [[TMP22:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i16>
; CHECK-NEXT: [[TMP23]] = zext <4 x i16> [[TMP22]] to <4 x i32>
-; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[TMP24:%.*]] = trunc <4 x i32> [[TMP23]] to <4 x i16>
; CHECK-NEXT: [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]])
; CHECK-NEXT: [[XOR]] = or i32 [[SUM_02]], [[EXT]]
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: [[XOR_LCSSA1:%.*]] = phi i32 [ [[XOR]], [[FOR_BODY]] ], [ [[TMP26]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_END]]
; CHECK-NEXT: [[TMP5]] = fmul fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP4]])
; CHECK-NEXT: [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP5]])
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> zeroinitializer, float [[BC_MERGE_RDX3]], i32 0
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI10:%.*]] = phi <4 x float> [ [[TMP9]], [[VEC_EPILOG_PH]] ], [ [[TMP16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI11:%.*]] = phi <4 x float> [ [[TMP10]], [[VEC_EPILOG_PH]] ], [ [[TMP15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX9]], 0
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
; CHECK-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <4 x float>*
; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, <4 x float>* [[TMP14]], align 4
; CHECK-NEXT: [[TMP15]] = fadd fast <4 x float> [[VEC_PHI11]], [[WIDE_LOAD12]]
; CHECK-NEXT: [[TMP16]] = fmul fast <4 x float> [[VEC_PHI10]], [[WIDE_LOAD12]]
-; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX9]], 4
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[N_VEC7]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[TMP18:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP15]])
; CHECK-NEXT: [[TMP19:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP16]])
; CHECK-NEXT: [[MUL]] = fmul fast float [[PROD]], [[TMP20]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: [[ADD_LCSSA5:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ], [ [[TMP18]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[MUL_LCSSA4:%.*]] = phi float [ [[MUL]], [[FOR_BODY]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: [[TMP5]] = sub <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP5]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ [[TMP8]], [[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX6]], 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
; CHECK-NEXT: [[TMP13]] = sub <4 x i32> [[VEC_PHI7]], [[WIDE_LOAD8]]
-; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
+; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
; CHECK: vec.epilog.middle.block:
; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]])
; CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[N]], [[N_VEC4]]
; CHECK-NEXT: [[SUB]] = sub nsw i32 [[SUM]], [[LOAD]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
; CHECK: for.cond.loopexit:
; CHECK-NEXT: [[SUB_LCSSA2:%.*]] = phi i32 [ [[SUB]], [[FOR_BODY]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[FOR_COND]]
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
; Avoid crashing while trying to vectorize an fcmp that can be folded to a
; vector of i1 true.
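; A rough C equivalent of @test1, reconstructed from the IR in the checks (not
; taken from the original source):
;   for (int i = 0; i < 144; ++i)
;     (void)__builtin_isunordered(0.0f, 0.0f); // becomes an fcmp uno of constants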
define void @test1() {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 144
-; CHECK-NEXT: br i1 [[TMP0]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 144, 144
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 144, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IVNEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[FCMP:%.*]] = fcmp uno float 0.000000e+00, 0.000000e+00
-; CHECK-NEXT: [[IVNEXT]] = add nsw i32 [[IV]], 1
-; CHECK-NEXT: [[CND:%.*]] = icmp sgt i32 [[IV]], 142
-; CHECK-NEXT: br i1 [[CND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: test1(
+; CHECK-LABEL: vector.body:
+; CHECK-NEXT: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: %index.next = add nuw i32 %index, 4
entry:
br label %loop
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
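; The chained first-order recurrence and cyclic-phi tests below use CHECK-NOT
; to assert that no vector.body is created for them;
; @test_first_order_recurrences_incoming_cycle_preheader, by contrast, is
; checked to produce a vector body.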
define void @test_chained_first_order_recurrences_1(i16* %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[FOR_1:%.*]] = phi i16 [ 22, [[ENTRY:%.*]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[FOR_2:%.*]] = phi i16 [ 33, [[ENTRY]] ], [ [[FOR_1]], [[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[IV]]
-; CHECK-NEXT: [[FOR_1_NEXT]] = load i16, i16* [[GEP_PTR]], align 2
-; CHECK-NEXT: [[ADD:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
-; CHECK-NEXT: store i16 [[ADD]], i16* [[GEP_PTR]], align 2
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_chained_first_order_recurrences_1
+; CHECK-NOT: vector.body:
;
entry:
br label %loop
}
define void @test_chained_first_order_recurrences_2(i16* %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[FOR_2:%.*]] = phi i16 [ 33, [[ENTRY:%.*]] ], [ [[FOR_1:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[FOR_1]] = phi i16 [ 22, [[ENTRY]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[IV]]
-; CHECK-NEXT: [[FOR_1_NEXT]] = load i16, i16* [[GEP_PTR]], align 2
-; CHECK-NEXT: [[ADD:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
-; CHECK-NEXT: store i16 [[ADD]], i16* [[GEP_PTR]], align 2
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_chained_first_order_recurrences_2
+; CHECK-NOT: vector.body:
;
entry:
br label %loop
}
define void @test_chained_first_order_recurrences_3(i16* %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrences_3(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[FOR_1:%.*]] = phi i16 [ 22, [[ENTRY:%.*]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[FOR_2:%.*]] = phi i16 [ 33, [[ENTRY]] ], [ [[FOR_1]], [[LOOP]] ]
-; CHECK-NEXT: [[FOR_3:%.*]] = phi i16 [ 33, [[ENTRY]] ], [ [[FOR_2]], [[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[IV]]
-; CHECK-NEXT: [[FOR_1_NEXT]] = load i16, i16* [[GEP_PTR]], align 2
-; CHECK-NEXT: [[ADD_1:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
-; CHECK-NEXT: [[ADD_2:%.*]] = add i16 [[ADD_1]], [[FOR_3]]
-; CHECK-NEXT: store i16 [[ADD_2]], i16* [[GEP_PTR]], align 2
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_chained_first_order_recurrences_3
+; CHECK-NOT: vector.body:
;
entry:
br label %loop
define void @test_cyclic_phis(i16* %ptr) {
-; CHECK-LABEL: @test_cyclic_phis(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[FOR_1:%.*]] = phi i16 [ 22, [[ENTRY:%.*]] ], [ [[FOR_2:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[FOR_2]] = phi i16 [ 33, [[ENTRY]] ], [ [[FOR_1]], [[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[IV]]
-; CHECK-NEXT: [[FOR_1_NEXT:%.*]] = load i16, i16* [[GEP_PTR]], align 2
-; CHECK-NEXT: [[ADD:%.*]] = add i16 [[FOR_1]], [[FOR_2]]
-; CHECK-NEXT: store i16 [[ADD]], i16* [[GEP_PTR]], align 2
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_cyclic_phis
+; CHECK-NOT: vector.body:
;
entry:
br label %loop
}
define void @test_first_order_recurrences_incoming_cycle_preheader(i16* %ptr) {
-; CHECK-LABEL: @test_first_order_recurrences_incoming_cycle_preheader(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP_1:%.*]]
-; CHECK: loop.1:
-; CHECK-NEXT: br i1 true, label [[LOOP_PREHEADER:%.*]], label [[LOOP_1]]
-; CHECK: loop.preheader:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
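+; Check that the loop is vectorized and that the recurrence phi in vector.body starts from <poison, poison, poison, 0>.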
+; CHECK-LABEL: @test_first_order_recurrences_incoming_cycle_preheader
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i16> [ <i16 poison, i16 poison, i16 poison, i16 0>, %vector.ph ], [ [[WIDE_LOAD:%.*]], %vector.body ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[TMP0]]
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[TMP1]], i32 0
; CHECK-NEXT: store <4 x i16> [[TMP5]], <4 x i16>* [[TMP6]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, [[LOOP_PREHEADER]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[SCALAR_RECUR:%.*]] = phi i16 [ [[FOR_1_NEXT:%.*]], [[LOOP]] ], [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds i16, i16* [[PTR]], i64 [[IV]]
-; CHECK-NEXT: [[FOR_1_NEXT]] = load i16, i16* [[GEP_PTR]], align 2
-; CHECK-NEXT: [[ADD:%.*]] = add i16 [[SCALAR_RECUR]], 10
-; CHECK-NEXT: store i16 [[ADD]], i16* [[GEP_PTR]], align 2
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br i1 [[TMP7]], label %middle.block, label %vector.body
;
entry:
br label %loop.1
}
define void @test_chained_first_order_recurrence_sink_users_1(double* %ptr) {
-; CHECK-LABEL: @test_chained_first_order_recurrence_sink_users_1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[FOR_1:%.*]] = phi double [ 1.000000e+01, [[ENTRY:%.*]] ], [ [[FOR_1_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[FOR_2:%.*]] = phi double [ 2.000000e+01, [[ENTRY]] ], [ [[FOR_1]], [[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[ADD_1:%.*]] = fadd double 1.000000e+01, [[FOR_2]]
-; CHECK-NEXT: [[ADD_2:%.*]] = fadd double [[ADD_1]], [[FOR_1]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 [[IV]]
-; CHECK-NEXT: [[FOR_1_NEXT]] = load double, double* [[GEP_PTR]], align 8
-; CHECK-NEXT: store double [[ADD_2]], double* [[GEP_PTR]], align 8
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_chained_first_order_recurrence_sink_users_1
+; CHECK-NOT: vector.body:
;
entry:
br label %loop
; CHECK-NEXT: store <4 x i16> [[TMP8]], <4 x i16>* [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
; CHECK-NEXT: store i16 [[SCALAR_RECUR]], i16* [[B]], align 4
; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1
; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
-; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: if.end:
; CHECK-NEXT: [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[SCALAR_RECUR]], [[FOR_COND]] ]
; CHECK-NEXT: ret i16 [[REC_LCSSA]]
; CHECK-NEXT: store <4 x i16> [[TMP8]], <4 x i16>* [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
; CHECK-NEXT: store i16 [[SCALAR_RECUR]], i16* [[B]], align 4
; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1
; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
-; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP17:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: if.end:
; CHECK-NEXT: [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_COND]] ], [ 10, [[FOR_BODY]] ]
; CHECK-NEXT: ret i16 [[REC_LCSSA]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[UMAX]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[TMP0]], 0
-; CHECK-NEXT: br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[UMAX1]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[UMAX1]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP6]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i64> [[TMP7]] to <4 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp slt <4 x i32> [[TMP8]], <i32 213, i32 213, i32 213, i32 213>
-; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP8]], <4 x i32> <i32 22, i32 22, i32 22, i32 22>
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP11]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP11]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = icmp slt <4 x i32> [[TMP13]], <i32 213, i32 213, i32 213, i32 213>
+; CHECK-NEXT: [[TMP15:%.*]] = select <4 x i1> [[TMP14]], <4 x i32> [[TMP13]], <4 x i32> <i32 22, i32 22, i32 22, i32 22>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP16]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP11]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP11]], i32 2
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: store i32 [[SELECT]], i32* [[GEP]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[CMP73:%.*]] = icmp ugt i32 [[N]], [[IV_NEXT]]
-; CHECK-NEXT: br i1 [[CMP73]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP73]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[N]], i32 1)
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[UMAX]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[TMP0]], 0
-; CHECK-NEXT: br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp slt i32 [[TMP0]], 0
+; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[UMAX1]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[UMAX1]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP6]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i64> [[TMP7]] to <4 x i32>
-; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP9]], <i32 99, i32 99, i32 99, i32 99>
-; CHECK-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[TMP8]], <i32 213, i32 213, i32 213, i32 213>
-; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP8]], <4 x i32> [[TMP10]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP11]] = zext <4 x i32> [[WIDE_LOAD]] to <4 x i64>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> [[TMP11]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i32>
+; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i32> [[TMP13]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP15:%.*]] = mul <4 x i32> [[TMP14]], <i32 99, i32 99, i32 99, i32 99>
+; CHECK-NEXT: [[TMP16:%.*]] = icmp slt <4 x i32> [[TMP13]], <i32 213, i32 213, i32 213, i32 213>
+; CHECK-NEXT: [[TMP17:%.*]] = select <4 x i1> [[TMP16]], <4 x i32> [[TMP13]], <4 x i32> [[TMP15]]
+; CHECK-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* [[TMP18]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[UMAX1]], [[N_VEC]]
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP6]], i32 3
-; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP6]], i32 2
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i64> [[TMP11]], i32 3
+; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i64> [[TMP11]], i32 2
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: store i32 [[SELECT]], i32* [[GEP]], align 4
; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
; CHECK-NEXT: [[CMP73:%.*]] = icmp ugt i32 [[N]], [[IV_NEXT]]
-; CHECK-NEXT: br i1 [[CMP73]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP21:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP73]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !0
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[TMP10]]
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; UNROLL-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 1
; UNROLL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
; UNROLL-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
-; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !0
+; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i64 4
; UNROLL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; UNROLL-NEXT: [[WIDE_LOAD7]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !0
+; UNROLL-NEXT: [[WIDE_LOAD7]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
; UNROLL-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; UNROLL-NEXT: [[TMP15:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[TMP12]]
; UNROLL-NEXT: [[TMP16:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[TMP13]]
; UNROLL-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; UNROLL-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4, !alias.scope !3, !noalias !0
+; UNROLL-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP17]], align 4
; UNROLL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i64 4
; UNROLL-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; UNROLL-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP19]], align 4, !alias.scope !3, !noalias !0
+; UNROLL-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP19]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4, !alias.scope !0
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 4
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD7]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4, !alias.scope !0
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD7]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> [[WIDE_LOAD7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP7]]
; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = add <4 x i32> [[WIDE_LOAD7]], [[TMP18]]
; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP21]], <4 x i32>* [[TMP24]], align 4, !alias.scope !3, !noalias !0
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP21]], <4 x i32>* [[TMP24]], align 4
; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 4
; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP26]], align 4, !alias.scope !3, !noalias !0
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP26]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[INDUCTION7]], 1
; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP7]]
; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP8]]
-; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]], align 4, !alias.scope !0
-; UNROLL-NO-VF-NEXT: [[TMP12]] = load i32, i32* [[TMP10]], align 4, !alias.scope !0
+; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP9]], align 4
+; UNROLL-NO-VF-NEXT: [[TMP12]] = load i32, i32* [[TMP10]], align 4
; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION]]
; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION7]]
; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = add i32 [[TMP11]], [[VECTOR_RECUR]]
; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = add i32 [[TMP12]], [[TMP11]]
-; UNROLL-NO-VF-NEXT: store i32 [[TMP15]], i32* [[TMP13]], align 4, !alias.scope !3, !noalias !0
-; UNROLL-NO-VF-NEXT: store i32 [[TMP16]], i32* [[TMP14]], align 4, !alias.scope !3, !noalias !0
+; UNROLL-NO-VF-NEXT: store i32 [[TMP15]], i32* [[TMP13]], align 4
+; UNROLL-NO-VF-NEXT: store i32 [[TMP16]], i32* [[TMP14]], align 4
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-VF-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; SINK-AFTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP8]]
; SINK-AFTER-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
; SINK-AFTER-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4, !alias.scope !0
+; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
; SINK-AFTER-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP7]]
; SINK-AFTER-NEXT: [[TMP14:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[TMP12]]
; SINK-AFTER-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP13]], i32 0
; SINK-AFTER-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; SINK-AFTER-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4, !alias.scope !3, !noalias !0
+; SINK-AFTER-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP16]], align 4
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ [[DOTPRE]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ poison, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ poison, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ poison, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NO-VF-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
+; UNROLL-NO-VF-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION2]]
+; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDUCTION1]]
; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP3]], align 4
; UNROLL-NO-VF-NEXT: [[TMP6]] = load i32, i32* [[TMP4]], align 4
; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = sub nsw i32 [[TMP5]], [[VECTOR_RECUR]]
; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i32 [[TMP7]], i32 0
; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = select i1 [[TMP10]], i32 [[TMP8]], i32 0
; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = icmp slt i32 [[VEC_PHI]], [[TMP11]]
-; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp slt i32 [[VEC_PHI1]], [[TMP12]]
+; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp slt i32 [[VEC_PHI2]], [[TMP12]]
; UNROLL-NO-VF-NEXT: [[TMP15]] = select i1 [[TMP13]], i32 [[VEC_PHI]], i32 [[TMP11]]
-; UNROLL-NO-VF-NEXT: [[TMP16]] = select i1 [[TMP14]], i32 [[VEC_PHI1]], i32 [[TMP12]]
+; UNROLL-NO-VF-NEXT: [[TMP16]] = select i1 [[TMP14]], i32 [[VEC_PHI2]], i32 [[TMP12]]
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-VF-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !11
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2
; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP12:%.*]] = sitofp <4 x i16> [[WIDE_LOAD]] to <4 x double>
; CHECK-NEXT: [[TMP13:%.*]] = sitofp <4 x i16> [[TMP11]] to <4 x double>
; CHECK-NEXT: [[TMP15:%.*]] = fsub fast <4 x double> [[TMP12]], [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[OFFSET_IDX]]
; CHECK-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
-; CHECK-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[TMP17]], align 8, !alias.scope !14, !noalias !11
+; CHECK-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[TMP17]], align 8
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = or i64 [[INDEX]], 1
; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[OFFSET_IDX]]
; UNROLL-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>*
-; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !11
+; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2
; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP9]], i64 4
; UNROLL-NEXT: [[TMP12:%.*]] = bitcast i16* [[TMP11]] to <4 x i16>*
-; UNROLL-NEXT: [[WIDE_LOAD8]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 2, !alias.scope !11
+; UNROLL-NEXT: [[WIDE_LOAD8]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 2
; UNROLL-NEXT: [[TMP13:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NEXT: [[TMP14:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD8]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NEXT: [[TMP15:%.*]] = sitofp <4 x i16> [[WIDE_LOAD]] to <4 x double>
; UNROLL-NEXT: [[TMP22:%.*]] = fsub fast <4 x double> [[TMP16]], [[TMP20]]
; UNROLL-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[OFFSET_IDX]]
; UNROLL-NEXT: [[TMP24:%.*]] = bitcast double* [[TMP23]] to <4 x double>*
-; UNROLL-NEXT: store <4 x double> [[TMP21]], <4 x double>* [[TMP24]], align 8, !alias.scope !14, !noalias !11
+; UNROLL-NEXT: store <4 x double> [[TMP21]], <4 x double>* [[TMP24]], align 8
; UNROLL-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[TMP23]], i64 4
; UNROLL-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>*
-; UNROLL-NEXT: store <4 x double> [[TMP22]], <4 x double>* [[TMP26]], align 8, !alias.scope !14, !noalias !11
+; UNROLL-NEXT: store <4 x double> [[TMP22]], <4 x double>* [[TMP26]], align 8
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP8]]
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[TMP9]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = bitcast i16* [[TMP11]] to <4 x i16>*
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 2, !alias.scope !11
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP12]], align 2
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[TMP9]], i32 4
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = bitcast i16* [[TMP13]] to <4 x i16>*
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD8]] = load <4 x i16>, <4 x i16>* [[TMP14]], align 2, !alias.scope !11
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD8]] = load <4 x i16>, <4 x i16>* [[TMP14]], align 2
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD8]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = sitofp <4 x i16> [[WIDE_LOAD]] to <4 x double>
; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP8]]
; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = getelementptr inbounds double, double* [[TMP25]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = bitcast double* [[TMP27]] to <4 x double>*
-; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP23]], <4 x double>* [[TMP28]], align 8, !alias.scope !14, !noalias !11
+; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP23]], <4 x double>* [[TMP28]], align 8
; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, double* [[TMP25]], i32 4
; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = bitcast double* [[TMP29]] to <4 x double>*
-; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP24]], <4 x double>* [[TMP30]], align 8, !alias.scope !14, !noalias !11
+; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP24]], <4 x double>* [[TMP30]], align 8
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; UNROLL-NO-VF-NEXT: [[INDUCTION8:%.*]] = add i64 [[OFFSET_IDX]], 1
; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[INDUCTION]]
; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[INDUCTION8]]
-; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]], align 2, !alias.scope !10
-; UNROLL-NO-VF-NEXT: [[TMP10]] = load i16, i16* [[TMP8]], align 2, !alias.scope !10
+; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]], align 2
+; UNROLL-NO-VF-NEXT: [[TMP10]] = load i16, i16* [[TMP8]], align 2
; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = sitofp i16 [[TMP9]] to double
; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = sitofp i16 [[TMP10]] to double
; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = sitofp i16 [[VECTOR_RECUR]] to double
; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = fsub fast double [[TMP12]], [[TMP16]]
; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDUCTION]]
; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDUCTION8]]
-; UNROLL-NO-VF-NEXT: store double [[TMP17]], double* [[TMP19]], align 8, !alias.scope !13, !noalias !10
-; UNROLL-NO-VF-NEXT: store double [[TMP18]], double* [[TMP20]], align 8, !alias.scope !13, !noalias !10
+; UNROLL-NO-VF-NEXT: store double [[TMP17]], double* [[TMP19]], align 8
+; UNROLL-NO-VF-NEXT: store double [[TMP18]], double* [[TMP20]], align 8
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; UNROLL-NO-VF-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-VF-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; SINK-AFTER-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP7]]
; SINK-AFTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP8]], i32 0
; SINK-AFTER-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>*
-; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !11
+; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2
; SINK-AFTER-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP12:%.*]] = sitofp <4 x i16> [[WIDE_LOAD]] to <4 x double>
; SINK-AFTER-NEXT: [[TMP13:%.*]] = sitofp <4 x i16> [[TMP11]] to <4 x double>
; SINK-AFTER-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP7]]
; SINK-AFTER-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, double* [[TMP16]], i32 0
; SINK-AFTER-NEXT: [[TMP18:%.*]] = bitcast double* [[TMP17]] to <4 x double>*
-; SINK-AFTER-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[TMP18]], align 8, !alias.scope !14, !noalias !11
+; SINK-AFTER-NEXT: store <4 x double> [[TMP15]], <4 x double>* [[TMP18]], align 8
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ <i64 1, i64 1, i64 1, i64 1>, [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; UNROLL-NO-IC-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; UNROLL-NO-IC-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: vector.body:
; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 poison, i64 poison, i64 poison, i64 0>, [[VECTOR_PH]] ], [ <i64 1, i64 1, i64 1, i64 1>, [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT: [[TMP0:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SINK-AFTER-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; SINK-AFTER-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; SINK-AFTER-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; SINK-AFTER-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; UNROLL-NO-IC-NEXT: [[TMP1]] = add <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT3]]
-; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; UNROLL-NO-IC-NEXT: [[TMP11]] = add <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT3]]
+; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP10]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
-; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
-; UNROLL-NO-IC-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
+; UNROLL-NO-IC-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 96, 96
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-VF: vector.body:
; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i32 [[INDEX]], 0
; UNROLL-NO-VF-NEXT: [[INDUCTION1:%.*]] = add i32 [[INDEX]], 1
-; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[INDUCTION]], [[X:%.*]]
-; UNROLL-NO-VF-NEXT: [[TMP1]] = add i32 [[INDUCTION1]], [[X]]
+; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i32 [[INDUCTION]], [[X:%.*]]
+; UNROLL-NO-VF-NEXT: [[TMP3]] = add i32 [[INDUCTION1]], [[X]]
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
-; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
+; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
+; UNROLL-NO-VF-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; UNROLL-NO-VF: middle.block:
; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 96, 96
; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-VF: scalar.ph:
-; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ]
; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 96, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; UNROLL-NO-VF-NEXT: br label [[FOR_BODY:%.*]]
; UNROLL-NO-VF: for.body:
; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], 95
; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; UNROLL-NO-VF: for.end:
-; UNROLL-NO-VF-NEXT: [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[TMP0]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-VF-NEXT: [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ]
; UNROLL-NO-VF-NEXT: ret i32 [[VAL_PHI_LCSSA]]
;
; SINK-AFTER-LABEL: @extract_second_last_iteration(
; SINK-AFTER: vector.body:
; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP0:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT: [[TMP0]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; SINK-AFTER-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP0]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT: [[TMP5]] = add <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SINK-AFTER-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP5]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; SINK-AFTER-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
-; SINK-AFTER-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; SINK-AFTER-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96
+; SINK-AFTER-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 96, 96
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP0]], i32 3
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP0]], i32 2
+; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP5]], i32 3
+; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP5]], i32 2
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP39:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT: [[VEC_PHI9:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP41:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 25
; UNROLL-NO-IC-NEXT: [[NEXT_GEP:%.*]] = getelementptr double, double* [[B]], i64 [[TMP1]]
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 7
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 25
; UNROLL-NO-IC-NEXT: [[NEXT_GEP8:%.*]] = getelementptr double, double* [[B]], i64 [[TMP15]]
-; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP2]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP3]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP4]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP5]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP6]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP7]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP8]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load double, double* [[TMP16]], align 8
-; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load double, double* [[TMP17]], align 8
+; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP]], i64 [[IDXPROM]]
+; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP2]], i64 [[IDXPROM]]
+; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP3]], i64 [[IDXPROM]]
+; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP4]], i64 [[IDXPROM]]
+; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP5]], i64 [[IDXPROM]]
+; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP6]], i64 [[IDXPROM]]
+; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP7]], i64 [[IDXPROM]]
+; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP8]], i64 [[IDXPROM]]
; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = load double, double* [[TMP18]], align 8
; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = load double, double* [[TMP19]], align 8
-; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = insertelement <4 x double> poison, double [[TMP24]], i32 0
-; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = insertelement <4 x double> [[TMP28]], double [[TMP25]], i32 1
-; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = insertelement <4 x double> [[TMP29]], double [[TMP26]], i32 2
-; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = insertelement <4 x double> [[TMP30]], double [[TMP27]], i32 3
-; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = load double, double* [[TMP20]], align 8
-; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = load double, double* [[TMP21]], align 8
+; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = load double, double* [[TMP20]], align 8
+; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = load double, double* [[TMP21]], align 8
+; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = insertelement <4 x double> poison, double [[TMP26]], i32 0
+; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = insertelement <4 x double> [[TMP30]], double [[TMP27]], i32 1
+; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = insertelement <4 x double> [[TMP31]], double [[TMP28]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = insertelement <4 x double> [[TMP32]], double [[TMP29]], i32 3
; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = load double, double* [[TMP22]], align 8
; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = load double, double* [[TMP23]], align 8
-; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = insertelement <4 x double> poison, double [[TMP32]], i32 0
-; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = insertelement <4 x double> [[TMP36]], double [[TMP33]], i32 1
-; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = insertelement <4 x double> [[TMP37]], double [[TMP34]], i32 2
-; UNROLL-NO-IC-NEXT: [[TMP39]] = insertelement <4 x double> [[TMP38]], double [[TMP35]], i32 3
-; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP31]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = shufflevector <4 x double> [[TMP31]], <4 x double> [[TMP39]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = fmul <4 x double> [[TMP40]], [[TMP31]]
-; UNROLL-NO-IC-NEXT: [[TMP43:%.*]] = fmul <4 x double> [[TMP41]], [[TMP39]]
-; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = fcmp une <4 x double> [[TMP42]], zeroinitializer
-; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = fcmp une <4 x double> [[TMP43]], zeroinitializer
-; UNROLL-NO-IC-NEXT: [[TMP46:%.*]] = zext <4 x i1> [[TMP44]] to <4 x i32>
-; UNROLL-NO-IC-NEXT: [[TMP47:%.*]] = zext <4 x i1> [[TMP45]] to <4 x i32>
-; UNROLL-NO-IC-NEXT: [[TMP48]] = add <4 x i32> [[VEC_PHI]], [[TMP46]]
-; UNROLL-NO-IC-NEXT: [[TMP49]] = add <4 x i32> [[VEC_PHI9]], [[TMP47]]
+; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = load double, double* [[TMP24]], align 8
+; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = load double, double* [[TMP25]], align 8
+; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = insertelement <4 x double> poison, double [[TMP34]], i32 0
+; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = insertelement <4 x double> [[TMP38]], double [[TMP35]], i32 1
+; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = insertelement <4 x double> [[TMP39]], double [[TMP36]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP41]] = insertelement <4 x double> [[TMP40]], double [[TMP37]], i32 3
+; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP33]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT: [[TMP43:%.*]] = shufflevector <4 x double> [[TMP33]], <4 x double> [[TMP41]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = fmul <4 x double> [[TMP42]], [[TMP33]]
+; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = fmul <4 x double> [[TMP43]], [[TMP41]]
+; UNROLL-NO-IC-NEXT: [[TMP46:%.*]] = fcmp une <4 x double> [[TMP44]], zeroinitializer
+; UNROLL-NO-IC-NEXT: [[TMP47:%.*]] = fcmp une <4 x double> [[TMP45]], zeroinitializer
+; UNROLL-NO-IC-NEXT: [[TMP48:%.*]] = zext <4 x i1> [[TMP46]] to <4 x i32>
+; UNROLL-NO-IC-NEXT: [[TMP49:%.*]] = zext <4 x i1> [[TMP47]] to <4 x i32>
+; UNROLL-NO-IC-NEXT: [[TMP50]] = add <4 x i32> [[VEC_PHI]], [[TMP48]]
+; UNROLL-NO-IC-NEXT: [[TMP51]] = add <4 x i32> [[VEC_PHI9]], [[TMP49]]
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
-; UNROLL-NO-IC-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
+; UNROLL-NO-IC-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP49]], [[TMP48]]
-; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
+; UNROLL-NO-IC-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP51]], [[TMP50]]
+; UNROLL-NO-IC-NEXT: [[TMP53:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 10240, 10240
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP39]], i32 3
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP39]], i32 2
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP41]], i32 3
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP41]], i32 2
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[J]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL:%.*]] = phi double* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ]
; UNROLL-NO-IC-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 10240, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
; UNROLL-NO-IC-NEXT: br label [[FOR_BODY:%.*]]
; UNROLL-NO-IC: for.cond.cleanup:
-; UNROLL-NO-IC-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ]
+; UNROLL-NO-IC-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP53]], [[MIDDLE_BLOCK]] ]
; UNROLL-NO-IC-NEXT: ret i32 [[A_1_LCSSA]]
; UNROLL-NO-IC: for.body:
; UNROLL-NO-IC-NEXT: [[B_ADDR_012:%.*]] = phi double* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[I_011:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC1:%.*]], [[FOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[A_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_1]], [[FOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP52:%.*]], [[FOR_BODY]] ]
+; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP54:%.*]], [[FOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B_ADDR_012]], i64 [[IDXPROM]]
-; UNROLL-NO-IC-NEXT: [[TMP52]] = load double, double* [[ARRAYIDX]], align 8
-; UNROLL-NO-IC-NEXT: [[MUL:%.*]] = fmul double [[SCALAR_RECUR]], [[TMP52]]
+; UNROLL-NO-IC-NEXT: [[TMP54]] = load double, double* [[ARRAYIDX]], align 8
+; UNROLL-NO-IC-NEXT: [[MUL:%.*]] = fmul double [[SCALAR_RECUR]], [[TMP54]]
; UNROLL-NO-IC-NEXT: [[TOBOOL:%.*]] = fcmp une double [[MUL]], 0.000000e+00
; UNROLL-NO-IC-NEXT: [[INC:%.*]] = zext i1 [[TOBOOL]] to i32
; UNROLL-NO-IC-NEXT: [[A_1]] = add nsw i32 [[A_010]], [[INC]]
; UNROLL-NO-VF: vector.body:
; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT: [[VEC_PHI4:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi double [ [[J:%.*]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 25
; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = zext i1 [[TMP10]] to i32
; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = zext i1 [[TMP11]] to i32
; UNROLL-NO-VF-NEXT: [[TMP14]] = add i32 [[VEC_PHI]], [[TMP12]]
-; UNROLL-NO-VF-NEXT: [[TMP15]] = add i32 [[VEC_PHI3]], [[TMP13]]
+; UNROLL-NO-VF-NEXT: [[TMP15]] = add i32 [[VEC_PHI4]], [[TMP13]]
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]]
; SINK-AFTER: vector.body:
; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x double> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; SINK-AFTER-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 25
; SINK-AFTER-NEXT: [[NEXT_GEP:%.*]] = getelementptr double, double* [[B]], i64 [[TMP1]]
; SINK-AFTER-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 3
; SINK-AFTER-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 25
; SINK-AFTER-NEXT: [[NEXT_GEP4:%.*]] = getelementptr double, double* [[B]], i64 [[TMP7]]
-; SINK-AFTER-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP]], i64 [[IDXPROM]]
-; SINK-AFTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP2]], i64 [[IDXPROM]]
-; SINK-AFTER-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP3]], i64 [[IDXPROM]]
-; SINK-AFTER-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP4]], i64 [[IDXPROM]]
-; SINK-AFTER-NEXT: [[TMP12:%.*]] = load double, double* [[TMP8]], align 8
+; SINK-AFTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP]], i64 [[IDXPROM]]
+; SINK-AFTER-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP2]], i64 [[IDXPROM]]
+; SINK-AFTER-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP3]], i64 [[IDXPROM]]
+; SINK-AFTER-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[NEXT_GEP4]], i64 [[IDXPROM]]
; SINK-AFTER-NEXT: [[TMP13:%.*]] = load double, double* [[TMP9]], align 8
; SINK-AFTER-NEXT: [[TMP14:%.*]] = load double, double* [[TMP10]], align 8
; SINK-AFTER-NEXT: [[TMP15:%.*]] = load double, double* [[TMP11]], align 8
-; SINK-AFTER-NEXT: [[TMP16:%.*]] = insertelement <4 x double> poison, double [[TMP12]], i32 0
-; SINK-AFTER-NEXT: [[TMP17:%.*]] = insertelement <4 x double> [[TMP16]], double [[TMP13]], i32 1
-; SINK-AFTER-NEXT: [[TMP18:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP14]], i32 2
-; SINK-AFTER-NEXT: [[TMP19]] = insertelement <4 x double> [[TMP18]], double [[TMP15]], i32 3
-; SINK-AFTER-NEXT: [[TMP20:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP19]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; SINK-AFTER-NEXT: [[TMP21:%.*]] = fmul <4 x double> [[TMP20]], [[TMP19]]
-; SINK-AFTER-NEXT: [[TMP22:%.*]] = fcmp une <4 x double> [[TMP21]], zeroinitializer
-; SINK-AFTER-NEXT: [[TMP23:%.*]] = zext <4 x i1> [[TMP22]] to <4 x i32>
-; SINK-AFTER-NEXT: [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[TMP23]]
+; SINK-AFTER-NEXT: [[TMP16:%.*]] = load double, double* [[TMP12]], align 8
+; SINK-AFTER-NEXT: [[TMP17:%.*]] = insertelement <4 x double> poison, double [[TMP13]], i32 0
+; SINK-AFTER-NEXT: [[TMP18:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP14]], i32 1
+; SINK-AFTER-NEXT: [[TMP19:%.*]] = insertelement <4 x double> [[TMP18]], double [[TMP15]], i32 2
+; SINK-AFTER-NEXT: [[TMP20]] = insertelement <4 x double> [[TMP19]], double [[TMP16]], i32 3
+; SINK-AFTER-NEXT: [[TMP21:%.*]] = shufflevector <4 x double> [[VECTOR_RECUR]], <4 x double> [[TMP20]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT: [[TMP22:%.*]] = fmul <4 x double> [[TMP21]], [[TMP20]]
+; SINK-AFTER-NEXT: [[TMP23:%.*]] = fcmp une <4 x double> [[TMP22]], zeroinitializer
+; SINK-AFTER-NEXT: [[TMP24:%.*]] = zext <4 x i1> [[TMP23]] to <4 x i32>
+; SINK-AFTER-NEXT: [[TMP25]] = add <4 x i32> [[VEC_PHI]], [[TMP24]]
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; SINK-AFTER-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
-; SINK-AFTER-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
+; SINK-AFTER-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240
+; SINK-AFTER-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; SINK-AFTER: middle.block:
-; SINK-AFTER-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]])
+; SINK-AFTER-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP25]])
; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i64 10240, 10240
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP19]], i32 3
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP19]], i32 2
+; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x double> [[TMP20]], i32 3
+; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x double> [[TMP20]], i32 2
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi double [ [[J]], [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
; SINK-AFTER-NEXT: [[BC_RESUME_VAL:%.*]] = phi double* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[B]], [[ENTRY]] ]
; SINK-AFTER-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 10240, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; SINK-AFTER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; SINK-AFTER-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
; SINK-AFTER-NEXT: br label [[FOR_BODY:%.*]]
; SINK-AFTER: for.cond.cleanup:
-; SINK-AFTER-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP26]], [[MIDDLE_BLOCK]] ]
+; SINK-AFTER-NEXT: [[A_1_LCSSA:%.*]] = phi i32 [ [[A_1:%.*]], [[FOR_BODY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
; SINK-AFTER-NEXT: ret i32 [[A_1_LCSSA]]
; SINK-AFTER: for.body:
; SINK-AFTER-NEXT: [[B_ADDR_012:%.*]] = phi double* [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ]
; SINK-AFTER-NEXT: [[I_011:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC1:%.*]], [[FOR_BODY]] ]
; SINK-AFTER-NEXT: [[A_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[A_1]], [[FOR_BODY]] ]
-; SINK-AFTER-NEXT: [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP27:%.*]], [[FOR_BODY]] ]
+; SINK-AFTER-NEXT: [[SCALAR_RECUR:%.*]] = phi double [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[TMP28:%.*]], [[FOR_BODY]] ]
; SINK-AFTER-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B_ADDR_012]], i64 [[IDXPROM]]
-; SINK-AFTER-NEXT: [[TMP27]] = load double, double* [[ARRAYIDX]], align 8
-; SINK-AFTER-NEXT: [[MUL:%.*]] = fmul double [[SCALAR_RECUR]], [[TMP27]]
+; SINK-AFTER-NEXT: [[TMP28]] = load double, double* [[ARRAYIDX]], align 8
+; SINK-AFTER-NEXT: [[MUL:%.*]] = fmul double [[SCALAR_RECUR]], [[TMP28]]
; SINK-AFTER-NEXT: [[TOBOOL:%.*]] = fcmp une double [[MUL]], 0.000000e+00
; SINK-AFTER-NEXT: [[INC:%.*]] = zext i1 [[TOBOOL]] to i32
; SINK-AFTER-NEXT: [[A_1]] = add nsw i32 [[A_010]], [[INC]]
; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !26
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32>
; CHECK-NEXT: [[TMP8:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !alias.scope !29, !noalias !26
+; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
; UNROLL-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1
; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP3]]
; UNROLL-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !26
+; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP4]], i64 4
; UNROLL-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
-; UNROLL-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 2, !alias.scope !26
+; UNROLL-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 2
; UNROLL-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NEXT: [[TMP9:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NEXT: [[TMP10:%.*]] = sext <4 x i16> [[TMP8]] to <4 x i32>
; UNROLL-NEXT: [[TMP15:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP11]]
; UNROLL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; UNROLL-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; UNROLL-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP17]], align 4, !alias.scope !29, !noalias !26
+; UNROLL-NEXT: store <4 x i32> [[TMP14]], <4 x i32>* [[TMP17]], align 4
; UNROLL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i64 4
; UNROLL-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; UNROLL-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP19]], align 4, !alias.scope !29, !noalias !26
+; UNROLL-NEXT: store <4 x i32> [[TMP15]], <4 x i32>* [[TMP19]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP4]]
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <4 x i16>*
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 2, !alias.scope !26
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 2
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 4
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>*
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !26
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[TMP11]] to <4 x i32>
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* [[TMP22]], align 4, !alias.scope !29, !noalias !26
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* [[TMP22]], align 4
; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 4
; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* [[TMP24]], align 4, !alias.scope !29, !noalias !26
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP18]], <4 x i32>* [[TMP24]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[INDUCTION7]], 1
; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP1]]
; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]]
-; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 2, !alias.scope !25
-; UNROLL-NO-VF-NEXT: [[TMP6]] = load i16, i16* [[TMP4]], align 2, !alias.scope !25
+; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 2
+; UNROLL-NO-VF-NEXT: [[TMP6]] = load i16, i16* [[TMP4]], align 2
; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = sext i16 [[VECTOR_RECUR]] to i32
; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sext i16 [[TMP5]] to i32
; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = sext i16 [[TMP5]] to i32
; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = mul nsw i32 [[TMP10]], [[TMP8]]
; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION]]
; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION7]]
-; UNROLL-NO-VF-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4, !alias.scope !28, !noalias !25
-; UNROLL-NO-VF-NEXT: store i32 [[TMP12]], i32* [[TMP14]], align 4, !alias.scope !28, !noalias !25
+; UNROLL-NO-VF-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4
+; UNROLL-NO-VF-NEXT: store i32 [[TMP12]], i32* [[TMP14]], align 4
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-VF-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
; SINK-AFTER-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]]
; SINK-AFTER-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 0
; SINK-AFTER-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !26
+; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
; SINK-AFTER-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32>
; SINK-AFTER-NEXT: [[TMP8:%.*]] = sext <4 x i16> [[WIDE_LOAD]] to <4 x i32>
; SINK-AFTER-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
; SINK-AFTER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
; SINK-AFTER-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; SINK-AFTER-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP12]], align 4, !alias.scope !29, !noalias !26
+; SINK-AFTER-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP12]], align 4
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]]
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP5]], i64 1
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP6]], i64 1
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP12]], align 4, !alias.scope !33, !noalias !36
-; CHECK-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP8]], align 2, !alias.scope !39
-; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP9]], align 2, !alias.scope !39
-; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP10]], align 2, !alias.scope !39
-; CHECK-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP11]], align 2, !alias.scope !39
+; CHECK-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP8]], align 2
+; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP9]], align 2
+; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[TMP10]], align 2
+; CHECK-NEXT: [[TMP16:%.*]] = load i16, i16* [[TMP11]], align 2
; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> poison, i16 [[TMP13]], i64 0
; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP14]], i64 1
; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i16> [[TMP18]], i16 [[TMP15]], i64 2
; CHECK-NEXT: [[TMP24:%.*]] = mul nsw <4 x i32> [[TMP23]], [[TMP22]]
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP24]], <4 x i32>* [[TMP26]], align 4, !alias.scope !40, !noalias !39
+; CHECK-NEXT: store <4 x i32> [[TMP24]], <4 x i32>* [[TMP26]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
; UNROLL-NEXT: [[TMP18:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP9]], i64 1
; UNROLL-NEXT: [[TMP19:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP10]], i64 1
; UNROLL-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; UNROLL-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP20]], align 4, !alias.scope !33, !noalias !36
+; UNROLL-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP20]], align 4
; UNROLL-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 4
; UNROLL-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; UNROLL-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP22]], align 4, !alias.scope !33, !noalias !36
-; UNROLL-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP12]], align 2, !alias.scope !39
-; UNROLL-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP13]], align 2, !alias.scope !39
-; UNROLL-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP14]], align 2, !alias.scope !39
-; UNROLL-NEXT: [[TMP26:%.*]] = load i16, i16* [[TMP15]], align 2, !alias.scope !39
+; UNROLL-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP22]], align 4
+; UNROLL-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP12]], align 2
+; UNROLL-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP13]], align 2
+; UNROLL-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP14]], align 2
+; UNROLL-NEXT: [[TMP26:%.*]] = load i16, i16* [[TMP15]], align 2
; UNROLL-NEXT: [[TMP27:%.*]] = insertelement <4 x i16> poison, i16 [[TMP23]], i64 0
; UNROLL-NEXT: [[TMP28:%.*]] = insertelement <4 x i16> [[TMP27]], i16 [[TMP24]], i64 1
; UNROLL-NEXT: [[TMP29:%.*]] = insertelement <4 x i16> [[TMP28]], i16 [[TMP25]], i64 2
; UNROLL-NEXT: [[TMP30:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[TMP26]], i64 3
-; UNROLL-NEXT: [[TMP31:%.*]] = load i16, i16* [[TMP16]], align 2, !alias.scope !39
-; UNROLL-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP17]], align 2, !alias.scope !39
-; UNROLL-NEXT: [[TMP33:%.*]] = load i16, i16* [[TMP18]], align 2, !alias.scope !39
-; UNROLL-NEXT: [[TMP34:%.*]] = load i16, i16* [[TMP19]], align 2, !alias.scope !39
+; UNROLL-NEXT: [[TMP31:%.*]] = load i16, i16* [[TMP16]], align 2
+; UNROLL-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP17]], align 2
+; UNROLL-NEXT: [[TMP33:%.*]] = load i16, i16* [[TMP18]], align 2
+; UNROLL-NEXT: [[TMP34:%.*]] = load i16, i16* [[TMP19]], align 2
; UNROLL-NEXT: [[TMP35:%.*]] = insertelement <4 x i16> poison, i16 [[TMP31]], i64 0
; UNROLL-NEXT: [[TMP36:%.*]] = insertelement <4 x i16> [[TMP35]], i16 [[TMP32]], i64 1
; UNROLL-NEXT: [[TMP37:%.*]] = insertelement <4 x i16> [[TMP36]], i16 [[TMP33]], i64 2
; UNROLL-NEXT: [[TMP46:%.*]] = mul nsw <4 x i32> [[TMP44]], [[TMP42]]
; UNROLL-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; UNROLL-NEXT: [[TMP48:%.*]] = bitcast i32* [[TMP47]] to <4 x i32>*
-; UNROLL-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP48]], align 4, !alias.scope !40, !noalias !39
+; UNROLL-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP48]], align 4
; UNROLL-NEXT: [[TMP49:%.*]] = getelementptr inbounds i32, i32* [[TMP47]], i64 4
; UNROLL-NEXT: [[TMP50:%.*]] = bitcast i32* [[TMP49]] to <4 x i32>*
-; UNROLL-NEXT: store <4 x i32> [[TMP46]], <4 x i32>* [[TMP50]], align 4, !alias.scope !40, !noalias !39
+; UNROLL-NEXT: store <4 x i32> [[TMP46]], <4 x i32>* [[TMP50]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NEXT: [[TMP51:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[TMP51]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP7]], i64 1
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP19]], align 4, !alias.scope !33, !noalias !36
+; UNROLL-NO-IC-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP19]], align 4
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 4
; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP21]], align 4, !alias.scope !33, !noalias !36
-; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP10]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP11]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP12]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP13]], align 2, !alias.scope !39
+; UNROLL-NO-IC-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP21]], align 4
+; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = load i16, i16* [[TMP10]], align 2
+; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = load i16, i16* [[TMP11]], align 2
+; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = load i16, i16* [[TMP12]], align 2
+; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = load i16, i16* [[TMP13]], align 2
; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP22]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP23]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = insertelement <4 x i16> [[TMP27]], i16 [[TMP24]], i32 2
; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = insertelement <4 x i16> [[TMP28]], i16 [[TMP25]], i32 3
-; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = load i16, i16* [[TMP14]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = load i16, i16* [[TMP15]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP16]], align 2, !alias.scope !39
-; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = load i16, i16* [[TMP17]], align 2, !alias.scope !39
+; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = load i16, i16* [[TMP14]], align 2
+; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = load i16, i16* [[TMP15]], align 2
+; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = load i16, i16* [[TMP16]], align 2
+; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = load i16, i16* [[TMP17]], align 2
; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = insertelement <4 x i16> poison, i16 [[TMP30]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = insertelement <4 x i16> [[TMP34]], i16 [[TMP31]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = insertelement <4 x i16> [[TMP35]], i16 [[TMP32]], i32 2
; UNROLL-NO-IC-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]]
; UNROLL-NO-IC-NEXT: [[TMP48:%.*]] = getelementptr inbounds i32, i32* [[TMP46]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP44]], <4 x i32>* [[TMP49]], align 4, !alias.scope !40, !noalias !39
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP44]], <4 x i32>* [[TMP49]], align 4
; UNROLL-NO-IC-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TMP46]], i32 4
; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP51]], align 4, !alias.scope !40, !noalias !39
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP45]], <4 x i32>* [[TMP51]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDUCTION17]]
; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[INDUCTION]], i64 1
; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[INDUCTION17]], i64 1
-; UNROLL-NO-VF-NEXT: store i32 7, i32* [[TMP0]], align 4, !alias.scope !32, !noalias !35
-; UNROLL-NO-VF-NEXT: store i32 7, i32* [[TMP1]], align 4, !alias.scope !32, !noalias !35
-; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 2, !alias.scope !38
-; UNROLL-NO-VF-NEXT: [[TMP5]] = load i16, i16* [[TMP3]], align 2, !alias.scope !38
+; UNROLL-NO-VF-NEXT: store i32 7, i32* [[TMP0]], align 4
+; UNROLL-NO-VF-NEXT: store i32 7, i32* [[TMP1]], align 4
+; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = load i16, i16* [[TMP2]], align 2
+; UNROLL-NO-VF-NEXT: [[TMP5]] = load i16, i16* [[TMP3]], align 2
; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = sext i16 [[VECTOR_RECUR]] to i32
; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = sext i16 [[TMP4]] to i32
; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sext i16 [[TMP4]] to i32
; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = mul nsw i32 [[TMP9]], [[TMP7]]
; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION]]
; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION17]]
-; UNROLL-NO-VF-NEXT: store i32 [[TMP10]], i32* [[TMP12]], align 4, !alias.scope !39, !noalias !38
-; UNROLL-NO-VF-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4, !alias.scope !39, !noalias !38
+; UNROLL-NO-VF-NEXT: store i32 [[TMP10]], i32* [[TMP12]], align 4
+; UNROLL-NO-VF-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
; SINK-AFTER-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i16], [2 x i16]* [[A]], i64 [[TMP3]], i64 1
; SINK-AFTER-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; SINK-AFTER-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; SINK-AFTER-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP10]], align 4, !alias.scope !33, !noalias !36
-; SINK-AFTER-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP5]], align 2, !alias.scope !39
-; SINK-AFTER-NEXT: [[TMP12:%.*]] = load i16, i16* [[TMP6]], align 2, !alias.scope !39
-; SINK-AFTER-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP7]], align 2, !alias.scope !39
-; SINK-AFTER-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP8]], align 2, !alias.scope !39
+; SINK-AFTER-NEXT: store <4 x i32> <i32 7, i32 7, i32 7, i32 7>, <4 x i32>* [[TMP10]], align 4
+; SINK-AFTER-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP5]], align 2
+; SINK-AFTER-NEXT: [[TMP12:%.*]] = load i16, i16* [[TMP6]], align 2
+; SINK-AFTER-NEXT: [[TMP13:%.*]] = load i16, i16* [[TMP7]], align 2
+; SINK-AFTER-NEXT: [[TMP14:%.*]] = load i16, i16* [[TMP8]], align 2
; SINK-AFTER-NEXT: [[TMP15:%.*]] = insertelement <4 x i16> poison, i16 [[TMP11]], i32 0
; SINK-AFTER-NEXT: [[TMP16:%.*]] = insertelement <4 x i16> [[TMP15]], i16 [[TMP12]], i32 1
; SINK-AFTER-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> [[TMP16]], i16 [[TMP13]], i32 2
; SINK-AFTER-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP0]]
; SINK-AFTER-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP23]], i32 0
; SINK-AFTER-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <4 x i32>*
-; SINK-AFTER-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP25]], align 4, !alias.scope !40, !noalias !39
+; SINK-AFTER-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP25]], align 4
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]]
; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP3]]
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !43
+; CHECK-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32>
; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <4 x i32> [[TMP8]], [[TMP9]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, !alias.scope !46, !noalias !43
+; CHECK-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
; UNROLL-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1
; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP3]]
; UNROLL-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !43
+; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP4]], i64 4
; UNROLL-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
-; UNROLL-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 2, !alias.scope !43
+; UNROLL-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP7]], align 2
; UNROLL-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NEXT: [[TMP9:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NEXT: [[TMP10:%.*]] = sext <4 x i16> [[TMP8]] to <4 x i32>
; UNROLL-NEXT: [[TMP17:%.*]] = mul nsw <4 x i32> [[TMP13]], [[TMP15]]
; UNROLL-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
; UNROLL-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; UNROLL-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP19]], align 4, !alias.scope !46, !noalias !43
+; UNROLL-NEXT: store <4 x i32> [[TMP16]], <4 x i32>* [[TMP19]], align 4
; UNROLL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i64 4
; UNROLL-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; UNROLL-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* [[TMP21]], align 4, !alias.scope !46, !noalias !43
+; UNROLL-NEXT: store <4 x i32> [[TMP17]], <4 x i32>* [[TMP21]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP4]]
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <4 x i16>*
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 2, !alias.scope !43
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, <4 x i16>* [[TMP8]], align 2
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 4
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <4 x i16>*
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2, !alias.scope !43
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD7]] = load <4 x i16>, <4 x i16>* [[TMP10]], align 2
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = sext <4 x i16> [[TMP11]] to <4 x i32>
; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP19]], <4 x i32>* [[TMP24]], align 4, !alias.scope !46, !noalias !43
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP19]], <4 x i32>* [[TMP24]], align 4
; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 4
; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP26]], align 4, !alias.scope !46, !noalias !43
+; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP20]], <4 x i32>* [[TMP26]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[INDUCTION7]], 1
; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP1]]
; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]]
-; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 2, !alias.scope !42
-; UNROLL-NO-VF-NEXT: [[TMP6]] = load i16, i16* [[TMP4]], align 2, !alias.scope !42
+; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP3]], align 2
+; UNROLL-NO-VF-NEXT: [[TMP6]] = load i16, i16* [[TMP4]], align 2
; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = sext i16 [[VECTOR_RECUR]] to i32
; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = sext i16 [[TMP5]] to i32
; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP7]], 2
; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = mul nsw i32 [[TMP10]], [[TMP12]]
; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION]]
; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDUCTION7]]
-; UNROLL-NO-VF-NEXT: store i32 [[TMP13]], i32* [[TMP15]], align 4, !alias.scope !45, !noalias !42
-; UNROLL-NO-VF-NEXT: store i32 [[TMP14]], i32* [[TMP16]], align 4, !alias.scope !45, !noalias !42
+; UNROLL-NO-VF-NEXT: store i32 [[TMP13]], i32* [[TMP15]], align 4
+; UNROLL-NO-VF-NEXT: store i32 [[TMP14]], i32* [[TMP16]], align 4
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-VF-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP47:![0-9]+]]
; SINK-AFTER-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[A]], i64 [[TMP2]]
; SINK-AFTER-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 0
; SINK-AFTER-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
-; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2, !alias.scope !43
+; SINK-AFTER-NEXT: [[WIDE_LOAD]] = load <4 x i16>, <4 x i16>* [[TMP5]], align 2
; SINK-AFTER-NEXT: [[TMP6:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP7:%.*]] = sext <4 x i16> [[TMP6]] to <4 x i32>
; SINK-AFTER-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP7]], <i32 2, i32 2, i32 2, i32 2>
; SINK-AFTER-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
; SINK-AFTER-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
; SINK-AFTER-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; SINK-AFTER-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4, !alias.scope !46, !noalias !43
+; SINK-AFTER-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; SINK-AFTER-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP48:![0-9]+]]
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], <i16 4, i16 4, i16 4, i16 4>
-; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
-; UNROLL-NO-IC-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
+; UNROLL-NO-IC-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 43, 40
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP7]], i32 3
; UNROLL-NO-VF: vector.body:
; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i16 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR1:%.*]] = phi i32 [ -27, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR2:%.*]] = phi i32 [ -27, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16
; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = add i16 -27, [[TMP0]]
; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i16 [[OFFSET_IDX]], 0
-; UNROLL-NO-VF-NEXT: [[INDUCTION2:%.*]] = add i16 [[OFFSET_IDX]], 1
+; UNROLL-NO-VF-NEXT: [[INDUCTION1:%.*]] = add i16 [[OFFSET_IDX]], 1
; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i16 [[INDUCTION]], 1
-; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i16 [[INDUCTION2]], 1
+; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i16 [[INDUCTION1]], 1
; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = zext i16 [[TMP1]] to i32
; UNROLL-NO-VF-NEXT: [[TMP4]] = zext i16 [[TMP2]] to i32
; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = add i16 [[TMP1]], 5
; UNROLL-NO-VF-NEXT: [[TMP6]] = add i16 [[TMP2]], 5
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42
-; UNROLL-NO-VF-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]]
+; UNROLL-NO-VF-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42
+; UNROLL-NO-VF-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP49:![0-9]+]]
; UNROLL-NO-VF: middle.block:
; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 43, 42
; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SINK-AFTER-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], <i16 4, i16 4, i16 4, i16 4>
-; SINK-AFTER-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
-; SINK-AFTER-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
+; SINK-AFTER-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 40
+; SINK-AFTER-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP50:![0-9]+]]
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 43, 40
; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[TMP3]], i32 3
; UNROLL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL: vector.body:
-; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE18:%.*]] ]
-; UNROLL-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_UDIV_CONTINUE18]] ]
-; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_UDIV_CONTINUE18]] ]
-; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE18]] ]
+; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE19:%.*]] ]
+; UNROLL-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_UDIV_CONTINUE19]] ]
+; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_UDIV_CONTINUE19]] ]
+; UNROLL-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE19]] ]
; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
-; UNROLL-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0
-; UNROLL-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; UNROLL-NEXT: [[VEC_IV:%.*]] = or <4 x i32> [[BROADCAST_SPLAT3]], <i32 0, i32 1, i32 2, i32 3>
-; UNROLL-NEXT: [[VEC_IV4:%.*]] = or <4 x i32> [[BROADCAST_SPLAT3]], <i32 4, i32 5, i32 6, i32 7>
+; UNROLL-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0
+; UNROLL-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer
+; UNROLL-NEXT: [[VEC_IV:%.*]] = or <4 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1, i32 2, i32 3>
+; UNROLL-NEXT: [[VEC_IV5:%.*]] = or <4 x i32> [[BROADCAST_SPLAT4]], <i32 4, i32 5, i32 6, i32 7>
; UNROLL-NEXT: [[TMP2:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; UNROLL-NEXT: [[TMP3:%.*]] = icmp ule <4 x i32> [[VEC_IV4]], [[BROADCAST_SPLAT]]
+; UNROLL-NEXT: [[TMP3:%.*]] = icmp ule <4 x i32> [[VEC_IV5]], [[BROADCAST_SPLAT]]
; UNROLL-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i64 0
; UNROLL-NEXT: br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; UNROLL: pred.udiv.if:
; UNROLL: pred.udiv.continue:
; UNROLL-NEXT: [[TMP7:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ]
; UNROLL-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1
-; UNROLL-NEXT: br i1 [[TMP8]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]]
+; UNROLL-NEXT: br i1 [[TMP8]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
; UNROLL: pred.udiv.if5:
; UNROLL-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], -1
; UNROLL-NEXT: [[TMP10:%.*]] = udiv i32 219220132, [[TMP9]]
; UNROLL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP10]], i64 1
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE6]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE7]]
; UNROLL: pred.udiv.continue6:
-; UNROLL-NEXT: [[TMP12:%.*]] = phi <4 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP11]], [[PRED_UDIV_IF5]] ]
+; UNROLL-NEXT: [[TMP12:%.*]] = phi <4 x i32> [ [[TMP7]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP11]], [[PRED_UDIV_IF6]] ]
; UNROLL-NEXT: [[TMP13:%.*]] = extractelement <4 x i1> [[TMP2]], i64 2
-; UNROLL-NEXT: br i1 [[TMP13]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
+; UNROLL-NEXT: br i1 [[TMP13]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
; UNROLL: pred.udiv.if7:
; UNROLL-NEXT: [[TMP14:%.*]] = add i32 [[OFFSET_IDX]], -2
; UNROLL-NEXT: [[TMP15:%.*]] = udiv i32 219220132, [[TMP14]]
; UNROLL-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP15]], i64 2
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE8]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE9]]
; UNROLL: pred.udiv.continue8:
-; UNROLL-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP16]], [[PRED_UDIV_IF7]] ]
+; UNROLL-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP16]], [[PRED_UDIV_IF8]] ]
; UNROLL-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP2]], i64 3
-; UNROLL-NEXT: br i1 [[TMP18]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]]
+; UNROLL-NEXT: br i1 [[TMP18]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
; UNROLL: pred.udiv.if9:
; UNROLL-NEXT: [[TMP19:%.*]] = add i32 [[OFFSET_IDX]], -3
; UNROLL-NEXT: [[TMP20:%.*]] = udiv i32 219220132, [[TMP19]]
; UNROLL-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP20]], i64 3
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE11]]
; UNROLL: pred.udiv.continue10:
-; UNROLL-NEXT: [[TMP22:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP21]], [[PRED_UDIV_IF9]] ]
+; UNROLL-NEXT: [[TMP22:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE9]] ], [ [[TMP21]], [[PRED_UDIV_IF10]] ]
; UNROLL-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0
-; UNROLL-NEXT: br i1 [[TMP23]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]]
+; UNROLL-NEXT: br i1 [[TMP23]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
; UNROLL: pred.udiv.if11:
; UNROLL-NEXT: [[TMP24:%.*]] = add i32 [[OFFSET_IDX]], -4
; UNROLL-NEXT: [[TMP25:%.*]] = udiv i32 219220132, [[TMP24]]
; UNROLL-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> poison, i32 [[TMP25]], i64 0
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE12]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE13]]
; UNROLL: pred.udiv.continue12:
-; UNROLL-NEXT: [[TMP27:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE10]] ], [ [[TMP26]], [[PRED_UDIV_IF11]] ]
+; UNROLL-NEXT: [[TMP27:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE11]] ], [ [[TMP26]], [[PRED_UDIV_IF12]] ]
; UNROLL-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
-; UNROLL-NEXT: br i1 [[TMP28]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]]
+; UNROLL-NEXT: br i1 [[TMP28]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
; UNROLL: pred.udiv.if13:
; UNROLL-NEXT: [[TMP29:%.*]] = add i32 [[OFFSET_IDX]], -5
; UNROLL-NEXT: [[TMP30:%.*]] = udiv i32 219220132, [[TMP29]]
; UNROLL-NEXT: [[TMP31:%.*]] = insertelement <4 x i32> [[TMP27]], i32 [[TMP30]], i64 1
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE14]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE15]]
; UNROLL: pred.udiv.continue14:
-; UNROLL-NEXT: [[TMP32:%.*]] = phi <4 x i32> [ [[TMP27]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP31]], [[PRED_UDIV_IF13]] ]
+; UNROLL-NEXT: [[TMP32:%.*]] = phi <4 x i32> [ [[TMP27]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP31]], [[PRED_UDIV_IF14]] ]
; UNROLL-NEXT: [[TMP33:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
-; UNROLL-NEXT: br i1 [[TMP33]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]]
+; UNROLL-NEXT: br i1 [[TMP33]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
; UNROLL: pred.udiv.if15:
; UNROLL-NEXT: [[TMP34:%.*]] = add i32 [[OFFSET_IDX]], -6
; UNROLL-NEXT: [[TMP35:%.*]] = udiv i32 219220132, [[TMP34]]
; UNROLL-NEXT: [[TMP36:%.*]] = insertelement <4 x i32> [[TMP32]], i32 [[TMP35]], i64 2
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE16]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE17]]
; UNROLL: pred.udiv.continue16:
-; UNROLL-NEXT: [[TMP37:%.*]] = phi <4 x i32> [ [[TMP32]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP36]], [[PRED_UDIV_IF15]] ]
+; UNROLL-NEXT: [[TMP37:%.*]] = phi <4 x i32> [ [[TMP32]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP36]], [[PRED_UDIV_IF16]] ]
; UNROLL-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
-; UNROLL-NEXT: br i1 [[TMP38]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18]]
+; UNROLL-NEXT: br i1 [[TMP38]], label [[PRED_UDIV_IF18:%.*]], label [[PRED_UDIV_CONTINUE19]]
; UNROLL: pred.udiv.if17:
; UNROLL-NEXT: [[TMP39:%.*]] = add i32 [[OFFSET_IDX]], -7
; UNROLL-NEXT: [[TMP40:%.*]] = udiv i32 219220132, [[TMP39]]
; UNROLL-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP40]], i64 3
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE18]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE19]]
; UNROLL: pred.udiv.continue18:
-; UNROLL-NEXT: [[TMP42]] = phi <4 x i32> [ [[TMP37]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP41]], [[PRED_UDIV_IF17]] ]
+; UNROLL-NEXT: [[TMP42]] = phi <4 x i32> [ [[TMP37]], [[PRED_UDIV_CONTINUE17]] ], [ [[TMP41]], [[PRED_UDIV_IF18]] ]
; UNROLL-NEXT: [[TMP43:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> [[TMP42]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NEXT: [[TMP45]] = add <4 x i32> [[VEC_PHI]], [[TMP43]]
-; UNROLL-NEXT: [[TMP46]] = add <4 x i32> [[VEC_PHI1]], [[TMP44]]
+; UNROLL-NEXT: [[TMP46]] = add <4 x i32> [[VEC_PHI2]], [[TMP44]]
; UNROLL-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; UNROLL-NEXT: [[TMP47:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[TMP47]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52:![0-9]+]], !llvm.loop [[LOOP53:![0-9]+]]
; UNROLL: middle.block:
-; UNROLL-NEXT: [[TMP48:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI1]]
+; UNROLL-NEXT: [[TMP48:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI2]]
; UNROLL-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP48]], [[TMP49]]
; UNROLL-NEXT: [[TMP50:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
-; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE18:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_UDIV_CONTINUE18]] ]
-; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE18]] ]
-; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_UDIV_CONTINUE18]] ]
+; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE19:%.*]] ]
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_UDIV_CONTINUE19]] ]
+; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_UDIV_CONTINUE19]] ]
+; UNROLL-NO-IC-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_UDIV_CONTINUE19]] ]
; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i32 0
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT3]], <i32 0, i32 1, i32 2, i32 3>
-; UNROLL-NO-IC-NEXT: [[VEC_IV4:%.*]] = add <4 x i32> [[BROADCAST_SPLAT3]], <i32 4, i32 5, i32 6, i32 7>
+; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i32 0
+; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1, i32 2, i32 3>
+; UNROLL-NO-IC-NEXT: [[VEC_IV5:%.*]] = add <4 x i32> [[BROADCAST_SPLAT4]], <i32 4, i32 5, i32 6, i32 7>
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp ule <4 x i32> [[VEC_IV4]], [[BROADCAST_SPLAT]]
+; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = icmp ule <4 x i32> [[VEC_IV5]], [[BROADCAST_SPLAT]]
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP2]], i32 0
; UNROLL-NO-IC-NEXT: br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; UNROLL-NO-IC: pred.udiv.if:
; UNROLL-NO-IC: pred.udiv.continue:
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_UDIV_IF]] ]
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP2]], i32 1
-; UNROLL-NO-IC-NEXT: br i1 [[TMP9]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP9]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
; UNROLL-NO-IC: pred.udiv.if5:
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add i32 [[OFFSET_IDX]], -1
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = udiv i32 219220132, [[TMP10]]
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i32 1
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE6]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE7]]
; UNROLL-NO-IC: pred.udiv.continue6:
-; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF5]] ]
+; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF6]] ]
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2
-; UNROLL-NO-IC-NEXT: br i1 [[TMP14]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP14]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
; UNROLL-NO-IC: pred.udiv.if7:
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], -2
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = udiv i32 219220132, [[TMP15]]
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i32 2
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE8]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE9]]
; UNROLL-NO-IC: pred.udiv.continue8:
-; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP17]], [[PRED_UDIV_IF7]] ]
+; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP17]], [[PRED_UDIV_IF8]] ]
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP2]], i32 3
-; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
; UNROLL-NO-IC: pred.udiv.if9:
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = add i32 [[OFFSET_IDX]], -3
; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = udiv i32 219220132, [[TMP20]]
; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i32 3
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE11]]
; UNROLL-NO-IC: pred.udiv.continue10:
-; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP22]], [[PRED_UDIV_IF9]] ]
+; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE9]] ], [ [[TMP22]], [[PRED_UDIV_IF10]] ]
; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
-; UNROLL-NO-IC-NEXT: br i1 [[TMP24]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP24]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
; UNROLL-NO-IC: pred.udiv.if11:
; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = add i32 [[OFFSET_IDX]], -4
; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = udiv i32 219220132, [[TMP25]]
; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i32 0
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE12]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE13]]
; UNROLL-NO-IC: pred.udiv.continue12:
-; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE10]] ], [ [[TMP27]], [[PRED_UDIV_IF11]] ]
+; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE11]] ], [ [[TMP27]], [[PRED_UDIV_IF12]] ]
; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP3]], i32 1
-; UNROLL-NO-IC-NEXT: br i1 [[TMP29]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP29]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
; UNROLL-NO-IC: pred.udiv.if13:
; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = add i32 [[OFFSET_IDX]], -5
; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = udiv i32 219220132, [[TMP30]]
; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i32 1
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE14]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE15]]
; UNROLL-NO-IC: pred.udiv.continue14:
-; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP32]], [[PRED_UDIV_IF13]] ]
+; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP32]], [[PRED_UDIV_IF14]] ]
; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP3]], i32 2
-; UNROLL-NO-IC-NEXT: br i1 [[TMP34]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP34]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
; UNROLL-NO-IC: pred.udiv.if15:
; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = add i32 [[OFFSET_IDX]], -6
; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = udiv i32 219220132, [[TMP35]]
; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i32 2
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE16]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE17]]
; UNROLL-NO-IC: pred.udiv.continue16:
-; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP37]], [[PRED_UDIV_IF15]] ]
+; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP37]], [[PRED_UDIV_IF16]] ]
; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP3]], i32 3
-; UNROLL-NO-IC-NEXT: br i1 [[TMP39]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP39]], label [[PRED_UDIV_IF18:%.*]], label [[PRED_UDIV_CONTINUE19]]
; UNROLL-NO-IC: pred.udiv.if17:
; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = add i32 [[OFFSET_IDX]], -7
; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = udiv i32 219220132, [[TMP40]]
; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i32 3
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE18]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE19]]
; UNROLL-NO-IC: pred.udiv.continue18:
-; UNROLL-NO-IC-NEXT: [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP42]], [[PRED_UDIV_IF17]] ]
+; UNROLL-NO-IC-NEXT: [[TMP43]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE17]] ], [ [[TMP42]], [[PRED_UDIV_IF18]] ]
; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP23]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> [[TMP43]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]]
-; UNROLL-NO-IC-NEXT: [[TMP47]] = add <4 x i32> [[VEC_PHI1]], [[TMP45]]
+; UNROLL-NO-IC-NEXT: [[TMP47]] = add <4 x i32> [[VEC_PHI2]], [[TMP45]]
; UNROLL-NO-IC-NEXT: [[TMP48:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]]
-; UNROLL-NO-IC-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI1]]
+; UNROLL-NO-IC-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP3]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI2]]
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[TMP50:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52:![0-9]+]], !llvm.loop [[LOOP53:![0-9]+]]
; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE5:%.*]] ]
; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[PRED_UDIV_CONTINUE5]] ]
; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[PRED_UDIV_CONTINUE5]] ]
-; UNROLL-NO-VF-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_UDIV_CONTINUE5]] ]
+; UNROLL-NO-VF-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_UDIV_CONTINUE5]] ]
; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
; UNROLL-NO-VF-NEXT: [[VEC_IV:%.*]] = add i32 [[INDEX]], 0
; UNROLL-NO-VF-NEXT: [[VEC_IV3:%.*]] = add i32 [[INDEX]], 1
; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_UDIV_IF]] ]
; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5]]
; UNROLL-NO-VF: pred.udiv.if4:
-; UNROLL-NO-VF-NEXT: [[INDUCTION2:%.*]] = add i32 [[OFFSET_IDX]], -1
-; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = udiv i32 219220132, [[INDUCTION2]]
+; UNROLL-NO-VF-NEXT: [[INDUCTION1:%.*]] = add i32 [[OFFSET_IDX]], -1
+; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = udiv i32 219220132, [[INDUCTION1]]
; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE5]]
; UNROLL-NO-VF: pred.udiv.continue5:
; UNROLL-NO-VF-NEXT: [[TMP7]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP6]], [[PRED_UDIV_IF4]] ]
; UNROLL-NO-VF-NEXT: [[TMP8]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]]
-; UNROLL-NO-VF-NEXT: [[TMP9]] = add i32 [[VEC_PHI1]], [[TMP5]]
+; UNROLL-NO-VF-NEXT: [[TMP9]] = add i32 [[VEC_PHI2]], [[TMP5]]
; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32 [[TMP8]], i32 [[VEC_PHI]]
-; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = select i1 [[TMP3]], i32 [[TMP9]], i32 [[VEC_PHI1]]
+; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = select i1 [[TMP3]], i32 [[TMP9]], i32 [[VEC_PHI2]]
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-VF-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF51:![0-9]+]], !llvm.loop [[LOOP52:![0-9]+]]
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE13:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE13]] ]
-; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[PRED_STORE_CONTINUE13]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[PRED_STORE_CONTINUE13]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE15:%.*]] ]
+; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[PRED_STORE_CONTINUE15]] ]
+; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[PRED_STORE_CONTINUE15]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[PRED_STORE_CONTINUE15]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], -1
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -2
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], -3
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ule <4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0
; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; CHECK: pred.udiv.if:
; CHECK: pred.udiv.continue:
; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ]
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1
-; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_UDIV_IF2:%.*]], label [[PRED_UDIV_CONTINUE3:%.*]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
; CHECK: pred.udiv.if2:
; CHECK-NEXT: [[TMP11:%.*]] = udiv i32 219220132, [[TMP2]]
; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP11]], i64 1
-; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE3]]
+; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE5]]
; CHECK: pred.udiv.continue3:
-; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF2]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_UDIV_IF4]] ]
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2
-; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
; CHECK: pred.udiv.if4:
; CHECK-NEXT: [[TMP15:%.*]] = udiv i32 219220132, [[TMP3]]
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP15]], i64 2
-; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE5]]
+; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE7]]
; CHECK: pred.udiv.continue5:
-; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE3]] ], [ [[TMP16]], [[PRED_UDIV_IF4]] ]
+; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP13]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP16]], [[PRED_UDIV_IF6]] ]
; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3
-; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
; CHECK: pred.udiv.if6:
; CHECK-NEXT: [[TMP19:%.*]] = udiv i32 219220132, [[TMP4]]
; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP19]], i64 3
-; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE7]]
+; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE9]]
; CHECK: pred.udiv.continue7:
-; CHECK-NEXT: [[TMP21]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP20]], [[PRED_UDIV_IF6]] ]
+; CHECK-NEXT: [[TMP21]] = phi <4 x i32> [ [[TMP17]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP20]], [[PRED_UDIV_IF8]] ]
; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP21]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: [[TMP23]] = add <4 x i32> [[VEC_PHI]], [[TMP22]]
; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
; CHECK: pred.store.continue:
; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1
-; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; CHECK-NEXT: br i1 [[TMP27]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
; CHECK: pred.store.if8:
; CHECK-NEXT: [[TMP28:%.*]] = or i32 [[INDEX]], 1
; CHECK-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP29]]
; CHECK-NEXT: store i32 [[TMP2]], i32* [[TMP30]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]]
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]]
; CHECK: pred.store.continue9:
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2
-; CHECK-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
+; CHECK-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
; CHECK: pred.store.if10:
; CHECK-NEXT: [[TMP32:%.*]] = or i32 [[INDEX]], 2
; CHECK-NEXT: [[TMP33:%.*]] = sext i32 [[TMP32]] to i64
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP33]]
; CHECK-NEXT: store i32 [[TMP3]], i32* [[TMP34]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE11]]
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE13]]
; CHECK: pred.store.continue11:
; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3
-; CHECK-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13]]
+; CHECK-NEXT: br i1 [[TMP35]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15]]
; CHECK: pred.store.if12:
; CHECK-NEXT: [[TMP36:%.*]] = or i32 [[INDEX]], 3
; CHECK-NEXT: [[TMP37:%.*]] = sext i32 [[TMP36]] to i64
; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP37]]
; CHECK-NEXT: store i32 [[TMP4]], i32* [[TMP38]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE13]]
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE15]]
; CHECK: pred.store.continue13:
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP39]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52]], !llvm.loop [[LOOP56:![0-9]+]]
; CHECK: middle.block:
; UNROLL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL: vector.body:
-; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE31:%.*]] ]
-; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE35:%.*]] ]
+; UNROLL-NEXT: [[VEC_IND3:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NEXT: [[STEP_ADD4:%.*]] = add <4 x i32> [[VEC_IND3]], <i32 4, i32 4, i32 4, i32 4>
; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
; UNROLL-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], -1
; UNROLL-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -2
; UNROLL-NEXT: [[TMP6:%.*]] = add i32 [[OFFSET_IDX]], -5
; UNROLL-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], -6
; UNROLL-NEXT: [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], -7
-; UNROLL-NEXT: [[TMP9:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; UNROLL-NEXT: [[TMP10:%.*]] = icmp ule <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; UNROLL-NEXT: [[TMP9:%.*]] = icmp ule <4 x i32> [[VEC_IND3]], [[BROADCAST_SPLAT]]
+; UNROLL-NEXT: [[TMP10:%.*]] = icmp ule <4 x i32> [[STEP_ADD4]], [[BROADCAST_SPLAT]]
; UNROLL-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP9]], i64 0
; UNROLL-NEXT: br i1 [[TMP11]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; UNROLL: pred.udiv.if:
; UNROLL: pred.udiv.continue:
; UNROLL-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_UDIV_IF]] ]
; UNROLL-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP9]], i64 1
-; UNROLL-NEXT: br i1 [[TMP15]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
+; UNROLL-NEXT: br i1 [[TMP15]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
; UNROLL: pred.udiv.if4:
; UNROLL-NEXT: [[TMP16:%.*]] = udiv i32 219220132, [[TMP2]]
; UNROLL-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP16]], i64 1
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE5]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE9]]
; UNROLL: pred.udiv.continue5:
-; UNROLL-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP17]], [[PRED_UDIV_IF4]] ]
+; UNROLL-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP17]], [[PRED_UDIV_IF8]] ]
; UNROLL-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP9]], i64 2
-; UNROLL-NEXT: br i1 [[TMP19]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
+; UNROLL-NEXT: br i1 [[TMP19]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
; UNROLL: pred.udiv.if6:
; UNROLL-NEXT: [[TMP20:%.*]] = udiv i32 219220132, [[TMP3]]
; UNROLL-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP20]], i64 2
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE7]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE11]]
; UNROLL: pred.udiv.continue7:
-; UNROLL-NEXT: [[TMP22:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP21]], [[PRED_UDIV_IF6]] ]
+; UNROLL-NEXT: [[TMP22:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE9]] ], [ [[TMP21]], [[PRED_UDIV_IF10]] ]
; UNROLL-NEXT: [[TMP23:%.*]] = extractelement <4 x i1> [[TMP9]], i64 3
-; UNROLL-NEXT: br i1 [[TMP23]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
+; UNROLL-NEXT: br i1 [[TMP23]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
; UNROLL: pred.udiv.if8:
; UNROLL-NEXT: [[TMP24:%.*]] = udiv i32 219220132, [[TMP4]]
; UNROLL-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP24]], i64 3
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE9]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE13]]
; UNROLL: pred.udiv.continue9:
-; UNROLL-NEXT: [[TMP26:%.*]] = phi <4 x i32> [ [[TMP22]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP25]], [[PRED_UDIV_IF8]] ]
+; UNROLL-NEXT: [[TMP26:%.*]] = phi <4 x i32> [ [[TMP22]], [[PRED_UDIV_CONTINUE11]] ], [ [[TMP25]], [[PRED_UDIV_IF12]] ]
; UNROLL-NEXT: [[TMP27:%.*]] = extractelement <4 x i1> [[TMP10]], i64 0
-; UNROLL-NEXT: br i1 [[TMP27]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
+; UNROLL-NEXT: br i1 [[TMP27]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
; UNROLL: pred.udiv.if10:
; UNROLL-NEXT: [[TMP28:%.*]] = udiv i32 219220132, [[TMP5]]
; UNROLL-NEXT: [[TMP29:%.*]] = insertelement <4 x i32> poison, i32 [[TMP28]], i64 0
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE11]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE15]]
; UNROLL: pred.udiv.continue11:
-; UNROLL-NEXT: [[TMP30:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE9]] ], [ [[TMP29]], [[PRED_UDIV_IF10]] ]
+; UNROLL-NEXT: [[TMP30:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE13]] ], [ [[TMP29]], [[PRED_UDIV_IF14]] ]
; UNROLL-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP10]], i64 1
-; UNROLL-NEXT: br i1 [[TMP31]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
+; UNROLL-NEXT: br i1 [[TMP31]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
; UNROLL: pred.udiv.if12:
; UNROLL-NEXT: [[TMP32:%.*]] = udiv i32 219220132, [[TMP6]]
; UNROLL-NEXT: [[TMP33:%.*]] = insertelement <4 x i32> [[TMP30]], i32 [[TMP32]], i64 1
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE13]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE17]]
; UNROLL: pred.udiv.continue13:
-; UNROLL-NEXT: [[TMP34:%.*]] = phi <4 x i32> [ [[TMP30]], [[PRED_UDIV_CONTINUE11]] ], [ [[TMP33]], [[PRED_UDIV_IF12]] ]
+; UNROLL-NEXT: [[TMP34:%.*]] = phi <4 x i32> [ [[TMP30]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP33]], [[PRED_UDIV_IF16]] ]
; UNROLL-NEXT: [[TMP35:%.*]] = extractelement <4 x i1> [[TMP10]], i64 2
-; UNROLL-NEXT: br i1 [[TMP35]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
+; UNROLL-NEXT: br i1 [[TMP35]], label [[PRED_UDIV_IF18:%.*]], label [[PRED_UDIV_CONTINUE19:%.*]]
; UNROLL: pred.udiv.if14:
; UNROLL-NEXT: [[TMP36:%.*]] = udiv i32 219220132, [[TMP7]]
; UNROLL-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP34]], i32 [[TMP36]], i64 2
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE15]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE19]]
; UNROLL: pred.udiv.continue15:
-; UNROLL-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP34]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP37]], [[PRED_UDIV_IF14]] ]
+; UNROLL-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP34]], [[PRED_UDIV_CONTINUE17]] ], [ [[TMP37]], [[PRED_UDIV_IF18]] ]
; UNROLL-NEXT: [[TMP39:%.*]] = extractelement <4 x i1> [[TMP10]], i64 3
-; UNROLL-NEXT: br i1 [[TMP39]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
+; UNROLL-NEXT: br i1 [[TMP39]], label [[PRED_UDIV_IF20:%.*]], label [[PRED_UDIV_CONTINUE21:%.*]]
; UNROLL: pred.udiv.if16:
; UNROLL-NEXT: [[TMP40:%.*]] = udiv i32 219220132, [[TMP8]]
; UNROLL-NEXT: [[TMP41:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP40]], i64 3
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE17]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE21]]
; UNROLL: pred.udiv.continue17:
-; UNROLL-NEXT: [[TMP42]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP41]], [[PRED_UDIV_IF16]] ]
+; UNROLL-NEXT: [[TMP42]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE19]] ], [ [[TMP41]], [[PRED_UDIV_IF20]] ]
; UNROLL-NEXT: [[TMP43:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP26]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[TMP26]], <4 x i32> [[TMP42]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NEXT: [[TMP45]] = add <4 x i32> [[VEC_PHI]], [[TMP43]]
-; UNROLL-NEXT: [[TMP46]] = add <4 x i32> [[VEC_PHI3]], [[TMP44]]
+; UNROLL-NEXT: [[TMP46]] = add <4 x i32> [[VEC_PHI7]], [[TMP44]]
; UNROLL-NEXT: [[TMP47:%.*]] = extractelement <4 x i1> [[TMP9]], i64 0
; UNROLL-NEXT: br i1 [[TMP47]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; UNROLL: pred.store.if:
; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE]]
; UNROLL: pred.store.continue:
; UNROLL-NEXT: [[TMP50:%.*]] = extractelement <4 x i1> [[TMP9]], i64 1
-; UNROLL-NEXT: br i1 [[TMP50]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; UNROLL-NEXT: br i1 [[TMP50]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
; UNROLL: pred.store.if18:
; UNROLL-NEXT: [[TMP51:%.*]] = or i32 [[INDEX]], 1
; UNROLL-NEXT: [[TMP52:%.*]] = sext i32 [[TMP51]] to i64
; UNROLL-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP52]]
; UNROLL-NEXT: store i32 [[TMP2]], i32* [[TMP53]], align 4
-; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE19]]
+; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE23]]
; UNROLL: pred.store.continue19:
; UNROLL-NEXT: [[TMP54:%.*]] = extractelement <4 x i1> [[TMP9]], i64 2
-; UNROLL-NEXT: br i1 [[TMP54]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
+; UNROLL-NEXT: br i1 [[TMP54]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
; UNROLL: pred.store.if20:
; UNROLL-NEXT: [[TMP55:%.*]] = or i32 [[INDEX]], 2
; UNROLL-NEXT: [[TMP56:%.*]] = sext i32 [[TMP55]] to i64
; UNROLL-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP56]]
; UNROLL-NEXT: store i32 [[TMP3]], i32* [[TMP57]], align 4
-; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE21]]
+; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE25]]
; UNROLL: pred.store.continue21:
; UNROLL-NEXT: [[TMP58:%.*]] = extractelement <4 x i1> [[TMP9]], i64 3
-; UNROLL-NEXT: br i1 [[TMP58]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
+; UNROLL-NEXT: br i1 [[TMP58]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
; UNROLL: pred.store.if22:
; UNROLL-NEXT: [[TMP59:%.*]] = or i32 [[INDEX]], 3
; UNROLL-NEXT: [[TMP60:%.*]] = sext i32 [[TMP59]] to i64
; UNROLL-NEXT: [[TMP61:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP60]]
; UNROLL-NEXT: store i32 [[TMP4]], i32* [[TMP61]], align 4
-; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE23]]
+; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE27]]
; UNROLL: pred.store.continue23:
; UNROLL-NEXT: [[TMP62:%.*]] = extractelement <4 x i1> [[TMP10]], i64 0
-; UNROLL-NEXT: br i1 [[TMP62]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
+; UNROLL-NEXT: br i1 [[TMP62]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
; UNROLL: pred.store.if24:
; UNROLL-NEXT: [[TMP63:%.*]] = or i32 [[INDEX]], 4
; UNROLL-NEXT: [[TMP64:%.*]] = sext i32 [[TMP63]] to i64
; UNROLL-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP64]]
; UNROLL-NEXT: store i32 [[TMP5]], i32* [[TMP65]], align 4
-; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE25]]
+; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE29]]
; UNROLL: pred.store.continue25:
; UNROLL-NEXT: [[TMP66:%.*]] = extractelement <4 x i1> [[TMP10]], i64 1
-; UNROLL-NEXT: br i1 [[TMP66]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
+; UNROLL-NEXT: br i1 [[TMP66]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]]
; UNROLL: pred.store.if26:
; UNROLL-NEXT: [[TMP67:%.*]] = or i32 [[INDEX]], 5
; UNROLL-NEXT: [[TMP68:%.*]] = sext i32 [[TMP67]] to i64
; UNROLL-NEXT: [[TMP69:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP68]]
; UNROLL-NEXT: store i32 [[TMP6]], i32* [[TMP69]], align 4
-; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE27]]
+; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE31]]
; UNROLL: pred.store.continue27:
; UNROLL-NEXT: [[TMP70:%.*]] = extractelement <4 x i1> [[TMP10]], i64 2
-; UNROLL-NEXT: br i1 [[TMP70]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
+; UNROLL-NEXT: br i1 [[TMP70]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]]
; UNROLL: pred.store.if28:
; UNROLL-NEXT: [[TMP71:%.*]] = or i32 [[INDEX]], 6
; UNROLL-NEXT: [[TMP72:%.*]] = sext i32 [[TMP71]] to i64
; UNROLL-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP72]]
; UNROLL-NEXT: store i32 [[TMP7]], i32* [[TMP73]], align 4
-; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE29]]
+; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE33]]
; UNROLL: pred.store.continue29:
; UNROLL-NEXT: [[TMP74:%.*]] = extractelement <4 x i1> [[TMP10]], i64 3
-; UNROLL-NEXT: br i1 [[TMP74]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31]]
+; UNROLL-NEXT: br i1 [[TMP74]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35]]
; UNROLL: pred.store.if30:
; UNROLL-NEXT: [[TMP75:%.*]] = or i32 [[INDEX]], 7
; UNROLL-NEXT: [[TMP76:%.*]] = sext i32 [[TMP75]] to i64
; UNROLL-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, i32* [[X]], i64 [[TMP76]]
; UNROLL-NEXT: store i32 [[TMP8]], i32* [[TMP77]], align 4
-; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE31]]
+; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE35]]
; UNROLL: pred.store.continue31:
; UNROLL-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
-; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
+; UNROLL-NEXT: [[VEC_IND_NEXT6]] = add <4 x i32> [[VEC_IND3]], <i32 8, i32 8, i32 8, i32 8>
; UNROLL-NEXT: [[TMP78:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[TMP78]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52]], !llvm.loop [[LOOP56:![0-9]+]]
; UNROLL: middle.block:
-; UNROLL-NEXT: [[TMP79:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI3]]
+; UNROLL-NEXT: [[TMP79:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI7]]
; UNROLL-NEXT: [[TMP80:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP45]], <4 x i32> [[VEC_PHI]]
; UNROLL-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP79]], [[TMP80]]
; UNROLL-NEXT: [[TMP81:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
-; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE31:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NO-IC-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_STORE_CONTINUE31]] ]
-; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE35:%.*]] ]
+; UNROLL-NO-IC-NEXT: [[VEC_IND3:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NO-IC-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP47:%.*]], [[PRED_STORE_CONTINUE35]] ]
+; UNROLL-NO-IC-NEXT: [[STEP_ADD4:%.*]] = add <4 x i32> [[VEC_IND3]], <i32 4, i32 4, i32 4, i32 4>
; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -1
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], -5
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], -6
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = add i32 [[OFFSET_IDX]], -7
-; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = icmp ule <4 x i32> [[STEP_ADD]], [[BROADCAST_SPLAT]]
+; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = icmp ule <4 x i32> [[VEC_IND3]], [[BROADCAST_SPLAT]]
+; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = icmp ule <4 x i32> [[STEP_ADD4]], [[BROADCAST_SPLAT]]
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
; UNROLL-NO-IC-NEXT: br i1 [[TMP12]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; UNROLL-NO-IC: pred.udiv.if:
; UNROLL-NO-IC: pred.udiv.continue:
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP14]], [[PRED_UDIV_IF]] ]
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1
-; UNROLL-NO-IC-NEXT: br i1 [[TMP16]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP16]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
; UNROLL-NO-IC: pred.udiv.if4:
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = udiv i32 219220132, [[TMP3]]
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP17]], i32 1
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE5]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE9]]
; UNROLL-NO-IC: pred.udiv.continue5:
-; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_UDIV_IF4]] ]
+; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP15]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP18]], [[PRED_UDIV_IF8]] ]
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2
-; UNROLL-NO-IC-NEXT: br i1 [[TMP20]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP20]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
; UNROLL-NO-IC: pred.udiv.if6:
; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = udiv i32 219220132, [[TMP4]]
; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP21]], i32 2
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE7]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE11]]
; UNROLL-NO-IC: pred.udiv.continue7:
-; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP22]], [[PRED_UDIV_IF6]] ]
+; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE9]] ], [ [[TMP22]], [[PRED_UDIV_IF10]] ]
; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3
-; UNROLL-NO-IC-NEXT: br i1 [[TMP24]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP24]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
; UNROLL-NO-IC: pred.udiv.if8:
; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = udiv i32 219220132, [[TMP5]]
; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = insertelement <4 x i32> [[TMP23]], i32 [[TMP25]], i32 3
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE9]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE13]]
; UNROLL-NO-IC: pred.udiv.continue9:
-; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP26]], [[PRED_UDIV_IF8]] ]
+; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE11]] ], [ [[TMP26]], [[PRED_UDIV_IF12]] ]
; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0
-; UNROLL-NO-IC-NEXT: br i1 [[TMP28]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP28]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
; UNROLL-NO-IC: pred.udiv.if10:
; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = udiv i32 219220132, [[TMP6]]
; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = insertelement <4 x i32> poison, i32 [[TMP29]], i32 0
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE11]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE15]]
; UNROLL-NO-IC: pred.udiv.continue11:
-; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE9]] ], [ [[TMP30]], [[PRED_UDIV_IF10]] ]
+; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE13]] ], [ [[TMP30]], [[PRED_UDIV_IF14]] ]
; UNROLL-NO-IC-NEXT: [[TMP32:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1
-; UNROLL-NO-IC-NEXT: br i1 [[TMP32]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP32]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
; UNROLL-NO-IC: pred.udiv.if12:
; UNROLL-NO-IC-NEXT: [[TMP33:%.*]] = udiv i32 219220132, [[TMP7]]
; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = insertelement <4 x i32> [[TMP31]], i32 [[TMP33]], i32 1
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE13]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE17]]
; UNROLL-NO-IC: pred.udiv.continue13:
-; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE11]] ], [ [[TMP34]], [[PRED_UDIV_IF12]] ]
+; UNROLL-NO-IC-NEXT: [[TMP35:%.*]] = phi <4 x i32> [ [[TMP31]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP34]], [[PRED_UDIV_IF16]] ]
; UNROLL-NO-IC-NEXT: [[TMP36:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2
-; UNROLL-NO-IC-NEXT: br i1 [[TMP36]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP36]], label [[PRED_UDIV_IF18:%.*]], label [[PRED_UDIV_CONTINUE19:%.*]]
; UNROLL-NO-IC: pred.udiv.if14:
; UNROLL-NO-IC-NEXT: [[TMP37:%.*]] = udiv i32 219220132, [[TMP8]]
; UNROLL-NO-IC-NEXT: [[TMP38:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP37]], i32 2
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE15]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE19]]
; UNROLL-NO-IC: pred.udiv.continue15:
-; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP35]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP38]], [[PRED_UDIV_IF14]] ]
+; UNROLL-NO-IC-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP35]], [[PRED_UDIV_CONTINUE17]] ], [ [[TMP38]], [[PRED_UDIV_IF18]] ]
; UNROLL-NO-IC-NEXT: [[TMP40:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3
-; UNROLL-NO-IC-NEXT: br i1 [[TMP40]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP40]], label [[PRED_UDIV_IF20:%.*]], label [[PRED_UDIV_CONTINUE21:%.*]]
; UNROLL-NO-IC: pred.udiv.if16:
; UNROLL-NO-IC-NEXT: [[TMP41:%.*]] = udiv i32 219220132, [[TMP9]]
; UNROLL-NO-IC-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP41]], i32 3
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE17]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE21]]
; UNROLL-NO-IC: pred.udiv.continue17:
-; UNROLL-NO-IC-NEXT: [[TMP43]] = phi <4 x i32> [ [[TMP39]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP42]], [[PRED_UDIV_IF16]] ]
+; UNROLL-NO-IC-NEXT: [[TMP43]] = phi <4 x i32> [ [[TMP39]], [[PRED_UDIV_CONTINUE19]] ], [ [[TMP42]], [[PRED_UDIV_IF20]] ]
; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP27]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = shufflevector <4 x i32> [[TMP27]], <4 x i32> [[TMP43]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; UNROLL-NO-IC-NEXT: [[TMP46]] = add <4 x i32> [[VEC_PHI]], [[TMP44]]
-; UNROLL-NO-IC-NEXT: [[TMP47]] = add <4 x i32> [[VEC_PHI3]], [[TMP45]]
+; UNROLL-NO-IC-NEXT: [[TMP47]] = add <4 x i32> [[VEC_PHI7]], [[TMP45]]
; UNROLL-NO-IC-NEXT: [[TMP48:%.*]] = extractelement <4 x i1> [[TMP10]], i32 0
; UNROLL-NO-IC-NEXT: br i1 [[TMP48]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; UNROLL-NO-IC: pred.store.if:
; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE]]
; UNROLL-NO-IC: pred.store.continue:
; UNROLL-NO-IC-NEXT: [[TMP51:%.*]] = extractelement <4 x i1> [[TMP10]], i32 1
-; UNROLL-NO-IC-NEXT: br i1 [[TMP51]], label [[PRED_STORE_IF18:%.*]], label [[PRED_STORE_CONTINUE19:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP51]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
; UNROLL-NO-IC: pred.store.if18:
; UNROLL-NO-IC-NEXT: [[TMP52:%.*]] = add i32 [[INDEX]], 1
; UNROLL-NO-IC-NEXT: [[TMP53:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP52]]
; UNROLL-NO-IC-NEXT: store i32 [[TMP3]], i32* [[TMP53]], align 4
-; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE19]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE23]]
; UNROLL-NO-IC: pred.store.continue19:
; UNROLL-NO-IC-NEXT: [[TMP54:%.*]] = extractelement <4 x i1> [[TMP10]], i32 2
-; UNROLL-NO-IC-NEXT: br i1 [[TMP54]], label [[PRED_STORE_IF20:%.*]], label [[PRED_STORE_CONTINUE21:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP54]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
; UNROLL-NO-IC: pred.store.if20:
; UNROLL-NO-IC-NEXT: [[TMP55:%.*]] = add i32 [[INDEX]], 2
; UNROLL-NO-IC-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP55]]
; UNROLL-NO-IC-NEXT: store i32 [[TMP4]], i32* [[TMP56]], align 4
-; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE21]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE25]]
; UNROLL-NO-IC: pred.store.continue21:
; UNROLL-NO-IC-NEXT: [[TMP57:%.*]] = extractelement <4 x i1> [[TMP10]], i32 3
-; UNROLL-NO-IC-NEXT: br i1 [[TMP57]], label [[PRED_STORE_IF22:%.*]], label [[PRED_STORE_CONTINUE23:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP57]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
; UNROLL-NO-IC: pred.store.if22:
; UNROLL-NO-IC-NEXT: [[TMP58:%.*]] = add i32 [[INDEX]], 3
; UNROLL-NO-IC-NEXT: [[TMP59:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP58]]
; UNROLL-NO-IC-NEXT: store i32 [[TMP5]], i32* [[TMP59]], align 4
-; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE23]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE27]]
; UNROLL-NO-IC: pred.store.continue23:
; UNROLL-NO-IC-NEXT: [[TMP60:%.*]] = extractelement <4 x i1> [[TMP11]], i32 0
-; UNROLL-NO-IC-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF24:%.*]], label [[PRED_STORE_CONTINUE25:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP60]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
; UNROLL-NO-IC: pred.store.if24:
; UNROLL-NO-IC-NEXT: [[TMP61:%.*]] = add i32 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[TMP62:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP61]]
; UNROLL-NO-IC-NEXT: store i32 [[TMP6]], i32* [[TMP62]], align 4
-; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE25]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE29]]
; UNROLL-NO-IC: pred.store.continue25:
; UNROLL-NO-IC-NEXT: [[TMP63:%.*]] = extractelement <4 x i1> [[TMP11]], i32 1
-; UNROLL-NO-IC-NEXT: br i1 [[TMP63]], label [[PRED_STORE_IF26:%.*]], label [[PRED_STORE_CONTINUE27:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP63]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31:%.*]]
; UNROLL-NO-IC: pred.store.if26:
; UNROLL-NO-IC-NEXT: [[TMP64:%.*]] = add i32 [[INDEX]], 5
; UNROLL-NO-IC-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP64]]
; UNROLL-NO-IC-NEXT: store i32 [[TMP7]], i32* [[TMP65]], align 4
-; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE27]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE31]]
; UNROLL-NO-IC: pred.store.continue27:
; UNROLL-NO-IC-NEXT: [[TMP66:%.*]] = extractelement <4 x i1> [[TMP11]], i32 2
-; UNROLL-NO-IC-NEXT: br i1 [[TMP66]], label [[PRED_STORE_IF28:%.*]], label [[PRED_STORE_CONTINUE29:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP66]], label [[PRED_STORE_IF32:%.*]], label [[PRED_STORE_CONTINUE33:%.*]]
; UNROLL-NO-IC: pred.store.if28:
; UNROLL-NO-IC-NEXT: [[TMP67:%.*]] = add i32 [[INDEX]], 6
; UNROLL-NO-IC-NEXT: [[TMP68:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP67]]
; UNROLL-NO-IC-NEXT: store i32 [[TMP8]], i32* [[TMP68]], align 4
-; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE29]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE33]]
; UNROLL-NO-IC: pred.store.continue29:
; UNROLL-NO-IC-NEXT: [[TMP69:%.*]] = extractelement <4 x i1> [[TMP11]], i32 3
-; UNROLL-NO-IC-NEXT: br i1 [[TMP69]], label [[PRED_STORE_IF30:%.*]], label [[PRED_STORE_CONTINUE31]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP69]], label [[PRED_STORE_IF34:%.*]], label [[PRED_STORE_CONTINUE35]]
; UNROLL-NO-IC: pred.store.if30:
; UNROLL-NO-IC-NEXT: [[TMP70:%.*]] = add i32 [[INDEX]], 7
; UNROLL-NO-IC-NEXT: [[TMP71:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP70]]
; UNROLL-NO-IC-NEXT: store i32 [[TMP9]], i32* [[TMP71]], align 4
-; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE31]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_STORE_CONTINUE35]]
; UNROLL-NO-IC: pred.store.continue31:
; UNROLL-NO-IC-NEXT: [[TMP72:%.*]] = select <4 x i1> [[TMP10]], <4 x i32> [[TMP46]], <4 x i32> [[VEC_PHI]]
-; UNROLL-NO-IC-NEXT: [[TMP73:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI3]]
+; UNROLL-NO-IC-NEXT: [[TMP73:%.*]] = select <4 x i1> [[TMP11]], <4 x i32> [[TMP47]], <4 x i32> [[VEC_PHI7]]
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 8
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], <i32 4, i32 4, i32 4, i32 4>
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT6]] = add <4 x i32> [[STEP_ADD4]], <i32 4, i32 4, i32 4, i32 4>
; UNROLL-NO-IC-NEXT: [[TMP74:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP74]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52]], !llvm.loop [[LOOP56:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE10:%.*]] ]
; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[PRED_STORE_CONTINUE10]] ]
; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[PRED_STORE_CONTINUE10]] ]
-; UNROLL-NO-VF-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_STORE_CONTINUE10]] ]
+; UNROLL-NO-VF-NEXT: [[VEC_PHI5:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_STORE_CONTINUE10]] ]
; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
-; UNROLL-NO-VF-NEXT: [[INDUCTION4:%.*]] = add i32 [[OFFSET_IDX]], 0
-; UNROLL-NO-VF-NEXT: [[INDUCTION5:%.*]] = add i32 [[OFFSET_IDX]], -1
+; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i32 [[OFFSET_IDX]], 0
+; UNROLL-NO-VF-NEXT: [[INDUCTION2:%.*]] = add i32 [[OFFSET_IDX]], -1
; UNROLL-NO-VF-NEXT: [[VEC_IV:%.*]] = add i32 [[INDEX]], 0
; UNROLL-NO-VF-NEXT: [[VEC_IV6:%.*]] = add i32 [[INDEX]], 1
; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp ule i32 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]]
; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = icmp ule i32 [[VEC_IV6]], [[TRIP_COUNT_MINUS_1]]
; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; UNROLL-NO-VF: pred.udiv.if:
-; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = udiv i32 219220132, [[INDUCTION4]]
+; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = udiv i32 219220132, [[INDUCTION]]
; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE]]
; UNROLL-NO-VF: pred.udiv.continue:
; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_UDIV_IF]] ]
; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
; UNROLL-NO-VF: pred.udiv.if7:
-; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = udiv i32 219220132, [[INDUCTION5]]
+; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = udiv i32 219220132, [[INDUCTION2]]
; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE8]]
; UNROLL-NO-VF: pred.udiv.continue8:
; UNROLL-NO-VF-NEXT: [[TMP7]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP6]], [[PRED_UDIV_IF7]] ]
; UNROLL-NO-VF-NEXT: [[TMP8]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]]
-; UNROLL-NO-VF-NEXT: [[TMP9]] = add i32 [[VEC_PHI2]], [[TMP5]]
+; UNROLL-NO-VF-NEXT: [[TMP9]] = add i32 [[VEC_PHI5]], [[TMP5]]
; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; UNROLL-NO-VF: pred.store.if:
-; UNROLL-NO-VF-NEXT: [[INDUCTION:%.*]] = add i32 [[INDEX]], 0
-; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT: store i32 [[INDUCTION4]], i32* [[TMP10]], align 4
+; UNROLL-NO-VF-NEXT: [[INDUCTION3:%.*]] = add i32 [[INDEX]], 0
+; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[X:%.*]], i32 [[INDUCTION3]]
+; UNROLL-NO-VF-NEXT: store i32 [[INDUCTION]], i32* [[TMP10]], align 4
; UNROLL-NO-VF-NEXT: br label [[PRED_STORE_CONTINUE]]
; UNROLL-NO-VF: pred.store.continue:
; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10]]
; UNROLL-NO-VF: pred.store.if9:
-; UNROLL-NO-VF-NEXT: [[INDUCTION3:%.*]] = add i32 [[INDEX]], 1
-; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[INDUCTION3]]
-; UNROLL-NO-VF-NEXT: store i32 [[INDUCTION5]], i32* [[TMP11]], align 4
+; UNROLL-NO-VF-NEXT: [[INDUCTION4:%.*]] = add i32 [[INDEX]], 1
+; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[INDUCTION4]]
+; UNROLL-NO-VF-NEXT: store i32 [[INDUCTION2]], i32* [[TMP11]], align 4
; UNROLL-NO-VF-NEXT: br label [[PRED_STORE_CONTINUE10]]
; UNROLL-NO-VF: pred.store.continue10:
; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = select i1 [[TMP2]], i32 [[TMP8]], i32 [[VEC_PHI]]
-; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP9]], i32 [[VEC_PHI2]]
+; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP9]], i32 [[VEC_PHI5]]
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF51]], !llvm.loop [[LOOP55:![0-9]+]]
; SINK-AFTER-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
; SINK-AFTER-NEXT: br label [[VECTOR_BODY:%.*]]
; SINK-AFTER: vector.body:
-; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE13:%.*]] ]
-; SINK-AFTER-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE13]] ]
-; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[PRED_STORE_CONTINUE13]] ]
-; SINK-AFTER-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[PRED_STORE_CONTINUE13]] ]
+; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE15:%.*]] ]
+; SINK-AFTER-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[PRED_STORE_CONTINUE15]] ]
+; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[PRED_STORE_CONTINUE15]] ]
+; SINK-AFTER-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[PRED_STORE_CONTINUE15]] ]
; SINK-AFTER-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]]
; SINK-AFTER-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0
; SINK-AFTER-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -1
; SINK-AFTER-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], -2
; SINK-AFTER-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], -3
-; SINK-AFTER-NEXT: [[TMP6:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; SINK-AFTER-NEXT: [[TMP6:%.*]] = icmp ule <4 x i32> [[VEC_IND2]], [[BROADCAST_SPLAT]]
; SINK-AFTER-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
; SINK-AFTER-NEXT: br i1 [[TMP7]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; SINK-AFTER: pred.udiv.if:
; SINK-AFTER: pred.udiv.continue:
; SINK-AFTER-NEXT: [[TMP10:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_UDIV_IF]] ]
; SINK-AFTER-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1
-; SINK-AFTER-NEXT: br i1 [[TMP11]], label [[PRED_UDIV_IF2:%.*]], label [[PRED_UDIV_CONTINUE3:%.*]]
+; SINK-AFTER-NEXT: br i1 [[TMP11]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
; SINK-AFTER: pred.udiv.if2:
; SINK-AFTER-NEXT: [[TMP12:%.*]] = udiv i32 219220132, [[TMP3]]
; SINK-AFTER-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP12]], i32 1
-; SINK-AFTER-NEXT: br label [[PRED_UDIV_CONTINUE3]]
+; SINK-AFTER-NEXT: br label [[PRED_UDIV_CONTINUE5]]
; SINK-AFTER: pred.udiv.continue3:
-; SINK-AFTER-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF2]] ]
+; SINK-AFTER-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF4]] ]
; SINK-AFTER-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2
-; SINK-AFTER-NEXT: br i1 [[TMP15]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
+; SINK-AFTER-NEXT: br i1 [[TMP15]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
; SINK-AFTER: pred.udiv.if4:
; SINK-AFTER-NEXT: [[TMP16:%.*]] = udiv i32 219220132, [[TMP4]]
; SINK-AFTER-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP16]], i32 2
-; SINK-AFTER-NEXT: br label [[PRED_UDIV_CONTINUE5]]
+; SINK-AFTER-NEXT: br label [[PRED_UDIV_CONTINUE7]]
; SINK-AFTER: pred.udiv.continue5:
-; SINK-AFTER-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_UDIV_CONTINUE3]] ], [ [[TMP17]], [[PRED_UDIV_IF4]] ]
+; SINK-AFTER-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP17]], [[PRED_UDIV_IF6]] ]
; SINK-AFTER-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3
-; SINK-AFTER-NEXT: br i1 [[TMP19]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
+; SINK-AFTER-NEXT: br i1 [[TMP19]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
; SINK-AFTER: pred.udiv.if6:
; SINK-AFTER-NEXT: [[TMP20:%.*]] = udiv i32 219220132, [[TMP5]]
; SINK-AFTER-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP20]], i32 3
-; SINK-AFTER-NEXT: br label [[PRED_UDIV_CONTINUE7]]
+; SINK-AFTER-NEXT: br label [[PRED_UDIV_CONTINUE9]]
; SINK-AFTER: pred.udiv.continue7:
-; SINK-AFTER-NEXT: [[TMP22]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP21]], [[PRED_UDIV_IF6]] ]
+; SINK-AFTER-NEXT: [[TMP22]] = phi <4 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP21]], [[PRED_UDIV_IF8]] ]
; SINK-AFTER-NEXT: [[TMP23:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP22]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; SINK-AFTER-NEXT: [[TMP24]] = add <4 x i32> [[VEC_PHI]], [[TMP23]]
; SINK-AFTER-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP6]], i32 0
; SINK-AFTER-NEXT: br label [[PRED_STORE_CONTINUE]]
; SINK-AFTER: pred.store.continue:
; SINK-AFTER-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP6]], i32 1
-; SINK-AFTER-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
+; SINK-AFTER-NEXT: br i1 [[TMP28]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
; SINK-AFTER: pred.store.if8:
; SINK-AFTER-NEXT: [[TMP29:%.*]] = add i32 [[INDEX]], 1
; SINK-AFTER-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP29]]
; SINK-AFTER-NEXT: store i32 [[TMP3]], i32* [[TMP30]], align 4
-; SINK-AFTER-NEXT: br label [[PRED_STORE_CONTINUE9]]
+; SINK-AFTER-NEXT: br label [[PRED_STORE_CONTINUE11]]
; SINK-AFTER: pred.store.continue9:
; SINK-AFTER-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP6]], i32 2
-; SINK-AFTER-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
+; SINK-AFTER-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
; SINK-AFTER: pred.store.if10:
; SINK-AFTER-NEXT: [[TMP32:%.*]] = add i32 [[INDEX]], 2
; SINK-AFTER-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP32]]
; SINK-AFTER-NEXT: store i32 [[TMP4]], i32* [[TMP33]], align 4
-; SINK-AFTER-NEXT: br label [[PRED_STORE_CONTINUE11]]
+; SINK-AFTER-NEXT: br label [[PRED_STORE_CONTINUE13]]
; SINK-AFTER: pred.store.continue11:
; SINK-AFTER-NEXT: [[TMP34:%.*]] = extractelement <4 x i1> [[TMP6]], i32 3
-; SINK-AFTER-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13]]
+; SINK-AFTER-NEXT: br i1 [[TMP34]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15]]
; SINK-AFTER: pred.store.if12:
; SINK-AFTER-NEXT: [[TMP35:%.*]] = add i32 [[INDEX]], 3
; SINK-AFTER-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[X]], i32 [[TMP35]]
; SINK-AFTER-NEXT: store i32 [[TMP5]], i32* [[TMP36]], align 4
-; SINK-AFTER-NEXT: br label [[PRED_STORE_CONTINUE13]]
+; SINK-AFTER-NEXT: br label [[PRED_STORE_CONTINUE15]]
; SINK-AFTER: pred.store.continue13:
; SINK-AFTER-NEXT: [[TMP37:%.*]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]]
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; SINK-AFTER-NEXT: [[VEC_IND_NEXT3]] = add <4 x i32> [[VEC_IND2]], <i32 4, i32 4, i32 4, i32 4>
; SINK-AFTER-NEXT: [[TMP38:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; SINK-AFTER-NEXT: br i1 [[TMP38]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF52]], !llvm.loop [[LOOP56:![0-9]+]]
; SINK-AFTER: middle.block:
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <4 x i16> [[VEC_IND]], <i16 4, i16 4, i16 4, i16 4>
; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
-; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i16 [[OFFSET_IDX]], 4
-; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add <4 x i16> [[VEC_IND]], <i16 1, i16 1, i16 1, i16 1>
-; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add <4 x i16> [[STEP_ADD]], <i16 1, i16 1, i16 1, i16 1>
-; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = or <4 x i16> [[TMP2]], [[TMP2]]
-; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = or <4 x i16> [[TMP3]], [[TMP3]]
-; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32>
-; UNROLL-NO-IC-NEXT: [[TMP7]] = zext <4 x i16> [[TMP5]] to <4 x i32>
-; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP6]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[A_PTR:%.*]], i16 [[TMP0]]
-; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[A_PTR]], i16 [[TMP1]]
-; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP10]], i32 0
-; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[TMP13]], align 4
-; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP10]], i32 4
-; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; UNROLL-NO-IC-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[TMP15]], align 4
+; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = add i16 [[OFFSET_IDX]], 4
+; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add <4 x i16> [[VEC_IND]], <i16 1, i16 1, i16 1, i16 1>
+; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = add <4 x i16> [[STEP_ADD]], <i16 1, i16 1, i16 1, i16 1>
+; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = or <4 x i16> [[TMP8]], [[TMP8]]
+; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = or <4 x i16> [[TMP9]], [[TMP9]]
+; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = zext <4 x i16> [[TMP10]] to <4 x i32>
+; UNROLL-NO-IC-NEXT: [[TMP13]] = zext <4 x i16> [[TMP11]] to <4 x i32>
+; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP12]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> [[TMP13]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = getelementptr i32, i32* [[A_PTR:%.*]], i16 [[TMP0]]
+; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[A_PTR]], i16 [[TMP4]]
+; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP24]], i32 0
+; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <4 x i32>*
+; UNROLL-NO-IC-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[TMP27]], align 4
+; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP24]], i32 4
+; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
+; UNROLL-NO-IC-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[TMP29]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], <i16 4, i16 4, i16 4, i16 4>
-; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; UNROLL-NO-IC-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
+; UNROLL-NO-IC-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 16, 16
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP7]], i32 3
-; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP7]], i32 2
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP13]], i32 3
+; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP13]], i32 2
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC: scalar.ph:
; UNROLL-NO-IC-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = or i16 [[TMP1]], [[TMP1]]
; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = zext i16 [[TMP2]] to i32
; UNROLL-NO-VF-NEXT: [[TMP5]] = zext i16 [[TMP3]] to i32
-; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[A_PTR:%.*]], i16 [[INDUCTION]]
-; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[A_PTR]], i16 [[INDUCTION1]]
-; UNROLL-NO-VF-NEXT: store i32 0, i32* [[TMP6]], align 4
-; UNROLL-NO-VF-NEXT: store i32 0, i32* [[TMP7]], align 4
+; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[A_PTR:%.*]], i16 [[INDUCTION]]
+; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[A_PTR]], i16 [[INDUCTION1]]
+; UNROLL-NO-VF-NEXT: store i32 0, i32* [[TMP14]], align 4
+; UNROLL-NO-VF-NEXT: store i32 0, i32* [[TMP15]], align 4
; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; UNROLL-NO-VF-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP57:![0-9]+]]
+; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
+; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP57:![0-9]+]]
; UNROLL-NO-VF: middle.block:
; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 16, 16
; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: vector.body:
; SINK-AFTER-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[VEC_IND:%.*]] = phi <4 x i16> [ <i16 0, i16 1, i16 2, i16 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; SINK-AFTER-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
; SINK-AFTER-NEXT: [[OFFSET_IDX:%.*]] = trunc i32 [[INDEX]] to i16
; SINK-AFTER-NEXT: [[TMP0:%.*]] = add i16 [[OFFSET_IDX]], 0
-; SINK-AFTER-NEXT: [[TMP1:%.*]] = add <4 x i16> [[VEC_IND]], <i16 1, i16 1, i16 1, i16 1>
-; SINK-AFTER-NEXT: [[TMP2:%.*]] = or <4 x i16> [[TMP1]], [[TMP1]]
-; SINK-AFTER-NEXT: [[TMP3]] = zext <4 x i16> [[TMP2]] to <4 x i32>
-; SINK-AFTER-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP3]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; SINK-AFTER-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[A_PTR:%.*]], i16 [[TMP0]]
-; SINK-AFTER-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0
-; SINK-AFTER-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; SINK-AFTER-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[TMP7]], align 4
+; SINK-AFTER-NEXT: [[TMP4:%.*]] = add <4 x i16> [[VEC_IND]], <i16 1, i16 1, i16 1, i16 1>
+; SINK-AFTER-NEXT: [[TMP5:%.*]] = or <4 x i16> [[TMP4]], [[TMP4]]
+; SINK-AFTER-NEXT: [[TMP6]] = zext <4 x i16> [[TMP5]] to <4 x i32>
+; SINK-AFTER-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP6]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; SINK-AFTER-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[A_PTR:%.*]], i16 [[TMP0]]
+; SINK-AFTER-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP12]], i32 0
+; SINK-AFTER-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
+; SINK-AFTER-NEXT: store <4 x i32> zeroinitializer, <4 x i32>* [[TMP14]], align 4
; SINK-AFTER-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; SINK-AFTER-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[VEC_IND]], <i16 4, i16 4, i16 4, i16 4>
-; SINK-AFTER-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; SINK-AFTER-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
+; SINK-AFTER-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
+; SINK-AFTER-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
; SINK-AFTER: middle.block:
; SINK-AFTER-NEXT: [[CMP_N:%.*]] = icmp eq i32 16, 16
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP3]], i32 3
-; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP3]], i32 2
+; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP6]], i32 3
+; SINK-AFTER-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i32> [[TMP6]], i32 2
; SINK-AFTER-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; SINK-AFTER: scalar.ph:
; SINK-AFTER-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
; SINK-AFTER-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP59:![0-9]+]]
; SINK-AFTER: for.end:
; SINK-AFTER-NEXT: ret void
-;
entry:
br label %loop
; VEC4_INTERL2-NEXT: [[CAST_CRD:%.*]] = sitofp i64 [[N_VEC]] to float
; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]]
; VEC4_INTERL2: vector.body:
-; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE16:%.*]] ]
+; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE17:%.*]] ]
; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = sitofp i64 [[INDEX]] to float
; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 4
; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
; VEC4_INTERL2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 4
; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; VEC4_INTERL2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
+; VEC4_INTERL2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], zeroinitializer
-; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD2]], zeroinitializer
+; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD3]], zeroinitializer
; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = extractelement <4 x i1> [[TMP6]], i64 0
; VEC4_INTERL2-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; VEC4_INTERL2: pred.store.if:
; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE]]
; VEC4_INTERL2: pred.store.continue:
; VEC4_INTERL2-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP6]], i64 1
-; VEC4_INTERL2-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; VEC4_INTERL2-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
; VEC4_INTERL2: pred.store.if3:
; VEC4_INTERL2-NEXT: [[TMP11:%.*]] = fadd fast float [[TMP0]], 1.000000e+00
; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = or i64 [[INDEX]], 1
; VEC4_INTERL2-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]]
; VEC4_INTERL2-NEXT: store float [[TMP11]], float* [[TMP13]], align 4
-; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE5]]
; VEC4_INTERL2: pred.store.continue4:
; VEC4_INTERL2-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP6]], i64 2
-; VEC4_INTERL2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]]
+; VEC4_INTERL2-NEXT: br i1 [[TMP14]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
; VEC4_INTERL2: pred.store.if5:
; VEC4_INTERL2-NEXT: [[TMP15:%.*]] = fadd fast float [[TMP0]], 2.000000e+00
; VEC4_INTERL2-NEXT: [[TMP16:%.*]] = or i64 [[INDEX]], 2
; VEC4_INTERL2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP16]]
; VEC4_INTERL2-NEXT: store float [[TMP15]], float* [[TMP17]], align 4
-; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE7]]
; VEC4_INTERL2: pred.store.continue6:
; VEC4_INTERL2-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP6]], i64 3
-; VEC4_INTERL2-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; VEC4_INTERL2-NEXT: br i1 [[TMP18]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9:%.*]]
; VEC4_INTERL2: pred.store.if7:
; VEC4_INTERL2-NEXT: [[TMP19:%.*]] = fadd fast float [[TMP0]], 3.000000e+00
; VEC4_INTERL2-NEXT: [[TMP20:%.*]] = or i64 [[INDEX]], 3
; VEC4_INTERL2-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP20]]
; VEC4_INTERL2-NEXT: store float [[TMP19]], float* [[TMP21]], align 4
-; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE9]]
; VEC4_INTERL2: pred.store.continue8:
; VEC4_INTERL2-NEXT: [[TMP22:%.*]] = extractelement <4 x i1> [[TMP7]], i64 0
-; VEC4_INTERL2-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; VEC4_INTERL2-NEXT: br i1 [[TMP22]], label [[PRED_STORE_IF10:%.*]], label [[PRED_STORE_CONTINUE11:%.*]]
; VEC4_INTERL2: pred.store.if9:
; VEC4_INTERL2-NEXT: [[TMP23:%.*]] = fadd fast float [[TMP0]], 4.000000e+00
; VEC4_INTERL2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
; VEC4_INTERL2-NEXT: store float [[TMP23]], float* [[TMP24]], align 4
-; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE10]]
+; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE11]]
; VEC4_INTERL2: pred.store.continue10:
; VEC4_INTERL2-NEXT: [[TMP25:%.*]] = extractelement <4 x i1> [[TMP7]], i64 1
-; VEC4_INTERL2-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12:%.*]]
+; VEC4_INTERL2-NEXT: br i1 [[TMP25]], label [[PRED_STORE_IF12:%.*]], label [[PRED_STORE_CONTINUE13:%.*]]
; VEC4_INTERL2: pred.store.if11:
; VEC4_INTERL2-NEXT: [[TMP26:%.*]] = fadd fast float [[TMP0]], 5.000000e+00
; VEC4_INTERL2-NEXT: [[TMP27:%.*]] = or i64 [[INDEX]], 5
; VEC4_INTERL2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP27]]
; VEC4_INTERL2-NEXT: store float [[TMP26]], float* [[TMP28]], align 4
-; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE12]]
+; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE13]]
; VEC4_INTERL2: pred.store.continue12:
; VEC4_INTERL2-NEXT: [[TMP29:%.*]] = extractelement <4 x i1> [[TMP7]], i64 2
-; VEC4_INTERL2-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF13:%.*]], label [[PRED_STORE_CONTINUE14:%.*]]
+; VEC4_INTERL2-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF14:%.*]], label [[PRED_STORE_CONTINUE15:%.*]]
; VEC4_INTERL2: pred.store.if13:
; VEC4_INTERL2-NEXT: [[TMP30:%.*]] = fadd fast float [[TMP0]], 6.000000e+00
; VEC4_INTERL2-NEXT: [[TMP31:%.*]] = or i64 [[INDEX]], 6
; VEC4_INTERL2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP31]]
; VEC4_INTERL2-NEXT: store float [[TMP30]], float* [[TMP32]], align 4
-; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE14]]
+; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE15]]
; VEC4_INTERL2: pred.store.continue14:
; VEC4_INTERL2-NEXT: [[TMP33:%.*]] = extractelement <4 x i1> [[TMP7]], i64 3
-; VEC4_INTERL2-NEXT: br i1 [[TMP33]], label [[PRED_STORE_IF15:%.*]], label [[PRED_STORE_CONTINUE16]]
+; VEC4_INTERL2-NEXT: br i1 [[TMP33]], label [[PRED_STORE_IF16:%.*]], label [[PRED_STORE_CONTINUE17]]
; VEC4_INTERL2: pred.store.if15:
; VEC4_INTERL2-NEXT: [[TMP34:%.*]] = fadd fast float [[TMP0]], 7.000000e+00
; VEC4_INTERL2-NEXT: [[TMP35:%.*]] = or i64 [[INDEX]], 7
; VEC4_INTERL2-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP35]]
; VEC4_INTERL2-NEXT: store float [[TMP34]], float* [[TMP36]], align 4
-; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE16]]
+; VEC4_INTERL2-NEXT: br label [[PRED_STORE_CONTINUE17]]
; VEC4_INTERL2: pred.store.continue16:
; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; VEC4_INTERL2-NEXT: [[TMP37:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: [[TMP5]] = select <4 x i1> [[TMP4]], <4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP7:%.*]] = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP5]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 65536, 65536
; CHECK-NEXT: [[T6]] = select i1 [[T5]], float [[T2]], float [[T4]]
; CHECK-NEXT: [[T7]] = add i64 [[T1]], 1
; CHECK-NEXT: [[T8:%.*]] = icmp eq i64 [[T7]], 65537
-; CHECK-NEXT: br i1 [[T8]], label [[OUT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[T8]], label [[OUT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
; CHECK: out:
; CHECK-NEXT: [[T6_LCSSA:%.*]] = phi float [ [[T6]], [[LOOP]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret float [[T6_LCSSA]]
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
define void @foo(float* %a, i64 %n) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4
+; CHECK: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* {{.*}}, align 4
; CHECK-NEXT: [[TMP4:%.*]] = fneg <4 x float> [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fneg float [[TMP7]]
-; CHECK-NEXT: store float [[SUB]], float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.exit:
-; CHECK-NEXT: ret void
+; CHECK: store <4 x float> [[TMP4]], <4 x float>* {{.*}}, align 4
;
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S
; RUN: opt < %s -debugify -loop-vectorize -S | FileCheck %s --check-prefix=DEBUGLOC
define void @f() nounwind uwtable ssp {
; Check that the induction phis and adds have a debug location.
;
-; DEBUGLOC-LABEL: @f(
-; DEBUGLOC-NEXT: scalar.ph:
-; DEBUGLOC-NEXT: store i8 0, i8* inttoptr (i64 1 to i8*), align 1, !dbg [[DBG22:![0-9]+]]
-; DEBUGLOC-NEXT: [[TMP0:%.*]] = load i8, i8* @a, align 1, !dbg [[DBG23:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8 [[TMP0]], metadata [[META9:![0-9]+]], metadata !DIExpression()), !dbg [[DBG23]]
-; DEBUGLOC-NEXT: br i1 false, label [[SCALAR_PH1:%.*]], label [[VECTOR_PH:%.*]], !dbg [[DBG24:![0-9]+]]
-; DEBUGLOC: vector.ph:
-; DEBUGLOC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i8> poison, i8 [[TMP0]], i32 0, !dbg [[DBG24]]
-; DEBUGLOC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT]], <4 x i8> poison, <4 x i32> zeroinitializer, !dbg [[DBG24]]
-; DEBUGLOC-NEXT: br label [[VECTOR_BODY:%.*]], !dbg [[DBG24]]
-; DEBUGLOC: vector.body:
-; DEBUGLOC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DEBUGLOC-NEXT: [[VEC_PHI:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 1, i8 1>, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
-; DEBUGLOC-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ undef, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ], !dbg [[DBG25:![0-9]+]]
-; DEBUGLOC-NEXT: [[TMP1:%.*]] = icmp ne <4 x i8> [[VEC_IND]], zeroinitializer, !dbg [[DBG26:![0-9]+]]
-; DEBUGLOC-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i8> [[VEC_IND]], <4 x i8> [[BROADCAST_SPLAT]], !dbg [[DBG27:![0-9]+]]
-; DEBUGLOC-NEXT: [[TMP3]] = mul <4 x i8> [[VEC_PHI]], [[TMP2]], !dbg [[DBG28:![0-9]+]]
-; DEBUGLOC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; DEBUGLOC-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], <i8 4, i8 4, i8 4, i8 4>, !dbg [[DBG25]]
-; DEBUGLOC-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; DEBUGLOC-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
-; DEBUGLOC: middle.block:
-; DEBUGLOC-NEXT: [[TMP5:%.*]] = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> [[TMP3]]), !dbg [[DBG31:![0-9]+]]
-; DEBUGLOC-NEXT: [[CMP_N:%.*]] = icmp eq i32 16, 16, !dbg [[DBG31]]
-; DEBUGLOC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH1]], !dbg [[DBG31]]
-; DEBUGLOC: scalar.ph1:
-; DEBUGLOC-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ undef, [[MIDDLE_BLOCK]] ], [ undef, [[SCALAR_PH:%.*]] ], !dbg [[DBG25]]
-; DEBUGLOC-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ 0, [[SCALAR_PH]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
-; DEBUGLOC-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG24]]
-; DEBUGLOC: for.body:
-; DEBUGLOC-NEXT: [[MUL16:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH1]] ], [ [[MUL:%.*]], [[FOR_BODY]] ], !dbg [[DBG32:![0-9]+]]
-; DEBUGLOC-NEXT: [[C_015:%.*]] = phi i8 [ [[BC_RESUME_VAL]], [[SCALAR_PH1]] ], [ [[CONV8:%.*]], [[FOR_BODY]] ], !dbg [[DBG25]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8 [[MUL16]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG32]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8 [[C_015]], metadata [[META12:![0-9]+]], metadata !DIExpression()), !dbg [[DBG25]]
-; DEBUGLOC-NEXT: [[CONV2:%.*]] = sext i8 [[C_015]] to i32, !dbg [[DBG33:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i32 [[CONV2]], metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG33]]
-; DEBUGLOC-NEXT: [[TOBOOL:%.*]] = icmp ne i8 [[C_015]], 0, !dbg [[DBG26]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i1 [[TOBOOL]], metadata [[META15:![0-9]+]], metadata !DIExpression()), !dbg [[DBG26]]
-; DEBUGLOC-NEXT: [[DOTSINK:%.*]] = select i1 [[TOBOOL]], i8 [[C_015]], i8 [[TMP0]], !dbg [[DBG27]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8 [[DOTSINK]], metadata [[META16:![0-9]+]], metadata !DIExpression()), !dbg [[DBG27]]
-; DEBUGLOC-NEXT: [[MUL]] = mul i8 [[MUL16]], [[DOTSINK]], !dbg [[DBG28]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8 [[MUL]], metadata [[META17:![0-9]+]], metadata !DIExpression()), !dbg [[DBG28]]
-; DEBUGLOC-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV2]], 1, !dbg [[DBG34:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i32 [[ADD]], metadata [[META18:![0-9]+]], metadata !DIExpression()), !dbg [[DBG34]]
-; DEBUGLOC-NEXT: [[CONV8]] = trunc i32 [[ADD]] to i8, !dbg [[DBG35:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8 [[CONV8]], metadata [[META19:![0-9]+]], metadata !DIExpression()), !dbg [[DBG35]]
-; DEBUGLOC-NEXT: [[SEXT:%.*]] = shl i32 [[ADD]], 24, !dbg [[DBG36:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i32 [[SEXT]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG36]]
-; DEBUGLOC-NEXT: [[PHITMP14:%.*]] = icmp slt i32 [[SEXT]], 268435456, !dbg [[DBG37:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i1 [[PHITMP14]], metadata [[META21:![0-9]+]], metadata !DIExpression()), !dbg [[DBG37]]
-; DEBUGLOC-NEXT: br i1 [[PHITMP14]], label [[FOR_BODY]], label [[FOR_END]], !dbg [[DBG31]], !llvm.loop [[LOOP38:![0-9]+]]
-; DEBUGLOC: for.end:
-; DEBUGLOC-NEXT: [[MUL_LCSSA:%.*]] = phi i8 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ], !dbg [[DBG28]]
-; DEBUGLOC-NEXT: store i8 [[MUL_LCSSA]], i8* @b, align 1, !dbg [[DBG40:![0-9]+]]
-; DEBUGLOC-NEXT: ret void, !dbg [[DBG41:![0-9]+]]
-;
+; DEBUGLOC-LABEL: vector.body:
+; DEBUGLOC: %vec.ind = phi {{.*}}, !dbg ![[DbgLoc:[0-9]+]]
+; DEBUGLOC: %vec.ind.next = add {{.*}}, !dbg ![[DbgLoc]]
scalar.ph:
store i8 0, i8* inttoptr (i64 1 to i8*), align 1
}
; Check that the debug location of the new phi is taken from the scalar phi %c.015 = phi i8.
+; DEBUGLOC: ![[DbgLoc]] = !DILocation(line: 5
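;
; A minimal illustrative sketch (not part of the original test; the function
; and metadata names below are hypothetical) of the input shape the DEBUGLOC
; checks are about: the scalar induction phi and its increment carry a !dbg
; location, and the vectorized %vec.ind / %vec.ind.next are expected to
; inherit it.
define void @dbgloc_sketch() !dbg !90 {
entry:
  br label %loop
loop:
  ; The location !93 on the phi and on the add below is the kind of location
  ; that should reappear on the vector induction phi and its update.
  %iv = phi i8 [ 0, %entry ], [ %iv.next, %loop ], !dbg !93
  %iv.next = add i8 %iv, 1, !dbg !93
  %cond = icmp ult i8 %iv.next, 16
  br i1 %cond, label %loop, label %done
done:
  ret void
}
!llvm.dbg.cu = !{!86}
!llvm.module.flags = !{!89}
!86 = distinct !DICompileUnit(language: DW_LANG_C99, file: !87, emissionKind: FullDebug, enums: !88)
!87 = !DIFile(filename: "sketch.c", directory: "/")
!88 = !{}
!89 = !{i32 2, !"Debug Info Version", i32 3}
!90 = distinct !DISubprogram(name: "dbgloc_sketch", scope: !87, file: !87, line: 4, type: !91, scopeLine: 4, spFlags: DISPFlagDefinition, unit: !86)
!91 = !DISubroutineType(types: !92)
!92 = !{null}
!93 = !DILocation(line: 5, column: 3, scope: !90)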
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -enable-if-conversion
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
while.body.i: ; preds = %if.end.i, %while.body.i.preheader
switch i8 undef, label %if.end.i [
- i8 39, label %if.then.i
- i8 92, label %if.then.i
+ i8 39, label %if.then.i
+ i8 92, label %if.then.i
]
if.then.i: ; preds = %while.body.i, %while.body.i
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; CHECK-NOT: %6000000 =
define void @_Z3fn4i(i32 %p1) {
-; CHECK-LABEL: @_Z3fn4i(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP88:%.*]] = icmp sgt i32 [[P1:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP88]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.lr.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** @b, align 8
-; CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** @a, align 8
-; CHECK-NEXT: [[TMP2:%.*]] = load i32*, i32** @c, align 8
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[_ZL3FN3II_EXIT58:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[AND_I:%.*]] = and i32 [[TMP4]], 1
-; CHECK-NEXT: [[TOBOOL_I_I:%.*]] = icmp eq i32 [[AND_I]], 0
-; CHECK-NEXT: br i1 [[TOBOOL_I_I]], label [[IF_END_I:%.*]], label [[IF_THEN_I:%.*]]
-; CHECK: if.then.i:
-; CHECK-NEXT: [[AND_I_I:%.*]] = lshr i32 [[TMP3]], 2
-; CHECK-NEXT: [[AND_LOBIT_I_I:%.*]] = and i32 [[AND_I_I]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = xor i32 [[AND_LOBIT_I_I]], 1
-; CHECK-NEXT: [[OR_I_I:%.*]] = or i32 [[TMP5]], [[TMP3]]
-; CHECK-NEXT: [[CMP_I:%.*]] = icmp sgt i32 [[OR_I_I]], 0
-; CHECK-NEXT: [[CONV_I:%.*]] = zext i1 [[CMP_I]] to i32
-; CHECK-NEXT: br label [[IF_END_I]]
-; CHECK: if.end.i:
-; CHECK-NEXT: [[TOBOOL_I87:%.*]] = phi i1 [ true, [[IF_THEN_I]] ], [ false, [[FOR_BODY]] ]
-; CHECK-NEXT: [[P1_ADDR_0_I:%.*]] = phi i32 [ [[CONV_I]], [[IF_THEN_I]] ], [ [[TMP3]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[AND1_I:%.*]] = and i32 [[TMP6]], 7
-; CHECK-NEXT: [[TOBOOL2_I:%.*]] = icmp eq i32 [[AND1_I]], 0
-; CHECK-NEXT: br i1 [[TOBOOL2_I]], label [[IF_END7_I:%.*]], label [[IF_THEN3_I:%.*]]
-; CHECK: if.then3.i:
-; CHECK-NEXT: [[P1_ADDR_0_LOBIT_I:%.*]] = lshr i32 [[P1_ADDR_0_I]], 31
-; CHECK-NEXT: [[AND6_I:%.*]] = and i32 [[P1_ADDR_0_I]], 1
-; CHECK-NEXT: [[OR_I:%.*]] = or i32 [[P1_ADDR_0_LOBIT_I]], [[AND6_I]]
-; CHECK-NEXT: br label [[IF_END7_I]]
-; CHECK: if.end7.i:
-; CHECK-NEXT: [[P1_ADDR_1_I:%.*]] = phi i32 [ [[OR_I]], [[IF_THEN3_I]] ], [ [[P1_ADDR_0_I]], [[IF_END_I]] ]
-; CHECK-NEXT: br i1 [[TOBOOL_I87]], label [[IF_THEN10_I:%.*]], label [[IF_END13_I:%.*]]
-; CHECK: if.then10.i:
-; CHECK-NEXT: [[CMP11_I:%.*]] = icmp sgt i32 [[P1_ADDR_1_I]], 0
-; CHECK-NEXT: [[CONV12_I:%.*]] = zext i1 [[CMP11_I]] to i32
-; CHECK-NEXT: br label [[IF_END13_I]]
-; CHECK: if.end13.i:
-; CHECK-NEXT: [[P1_ADDR_2_I:%.*]] = phi i32 [ [[CONV12_I]], [[IF_THEN10_I]] ], [ [[P1_ADDR_1_I]], [[IF_END7_I]] ]
-; CHECK-NEXT: br i1 [[TOBOOL_I_I]], label [[_Z3FN2III_EXIT:%.*]], label [[IF_THEN16_I:%.*]]
-; CHECK: if.then16.i:
-; CHECK-NEXT: [[AND17_I:%.*]] = lshr i32 [[P1_ADDR_2_I]], 3
-; CHECK-NEXT: [[AND17_LOBIT_I:%.*]] = and i32 [[AND17_I]], 1
-; CHECK-NEXT: br label [[_Z3FN2III_EXIT]]
-; CHECK: _Z3fn2iii.exit:
-; CHECK-NEXT: [[P1_ADDR_3_I:%.*]] = phi i32 [ [[AND17_LOBIT_I]], [[IF_THEN16_I]] ], [ [[P1_ADDR_2_I]], [[IF_END13_I]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[SHR_I:%.*]] = ashr i32 [[TMP7]], 1
-; CHECK-NEXT: [[AND_I18_I:%.*]] = and i32 [[SHR_I]], 1
-; CHECK-NEXT: [[TOBOOL_I19_I:%.*]] = icmp ne i32 [[AND_I18_I]], 0
-; CHECK-NEXT: br i1 [[TOBOOL_I19_I]], label [[IF_THEN_I20_I:%.*]], label [[IF_END_I_I:%.*]]
-; CHECK: if.then.i20.i:
-; CHECK-NEXT: [[CMP_I_I:%.*]] = icmp sgt i32 [[P1_ADDR_3_I]], 0
-; CHECK-NEXT: [[CONV_I_I:%.*]] = zext i1 [[CMP_I_I]] to i32
-; CHECK-NEXT: br label [[IF_END_I_I]]
-; CHECK: if.end.i.i:
-; CHECK-NEXT: [[P1_ADDR_0_I21_I:%.*]] = phi i32 [ [[CONV_I_I]], [[IF_THEN_I20_I]] ], [ [[P1_ADDR_3_I]], [[_Z3FN2III_EXIT]] ]
-; CHECK-NEXT: [[AND1_I_I:%.*]] = and i32 [[SHR_I]], 7
-; CHECK-NEXT: [[TOBOOL2_I_I:%.*]] = icmp eq i32 [[AND1_I_I]], 0
-; CHECK-NEXT: br i1 [[TOBOOL2_I_I]], label [[IF_END7_I_I:%.*]], label [[IF_THEN3_I_I:%.*]]
-; CHECK: if.then3.i.i:
-; CHECK-NEXT: [[P1_ADDR_0_LOBIT_I_I:%.*]] = lshr i32 [[P1_ADDR_0_I21_I]], 31
-; CHECK-NEXT: [[AND6_I_I:%.*]] = and i32 [[P1_ADDR_0_I21_I]], 1
-; CHECK-NEXT: [[OR_I22_I:%.*]] = or i32 [[P1_ADDR_0_LOBIT_I_I]], [[AND6_I_I]]
-; CHECK-NEXT: br label [[IF_END7_I_I]]
-; CHECK: if.end7.i.i:
-; CHECK-NEXT: [[P1_ADDR_1_I_I:%.*]] = phi i32 [ [[OR_I22_I]], [[IF_THEN3_I_I]] ], [ [[P1_ADDR_0_I21_I]], [[IF_END_I_I]] ]
-; CHECK-NEXT: br i1 [[TOBOOL_I19_I]], label [[IF_THEN10_I_I:%.*]], label [[IF_END13_I_I:%.*]]
-; CHECK: if.then10.i.i:
-; CHECK-NEXT: [[CMP11_I_I:%.*]] = icmp sgt i32 [[P1_ADDR_1_I_I]], 0
-; CHECK-NEXT: [[CONV12_I_I:%.*]] = zext i1 [[CMP11_I_I]] to i32
-; CHECK-NEXT: br label [[IF_END13_I_I]]
-; CHECK: if.end13.i.i:
-; CHECK-NEXT: [[P1_ADDR_2_I_I:%.*]] = phi i32 [ [[CONV12_I_I]], [[IF_THEN10_I_I]] ], [ [[P1_ADDR_1_I_I]], [[IF_END7_I_I]] ]
-; CHECK-NEXT: [[AND14_I_I:%.*]] = and i32 [[SHR_I]], 5
-; CHECK-NEXT: [[TOBOOL15_I_I:%.*]] = icmp eq i32 [[AND14_I_I]], 0
-; CHECK-NEXT: br i1 [[TOBOOL15_I_I]], label [[_Z3FN2III_EXIT_I:%.*]], label [[IF_THEN16_I_I:%.*]]
-; CHECK: if.then16.i.i:
-; CHECK-NEXT: [[AND17_I_I:%.*]] = lshr i32 [[P1_ADDR_2_I_I]], 3
-; CHECK-NEXT: [[AND17_LOBIT_I_I:%.*]] = and i32 [[AND17_I_I]], 1
-; CHECK-NEXT: br label [[_Z3FN2III_EXIT_I]]
-; CHECK: _Z3fn2iii.exit.i:
-; CHECK-NEXT: [[P1_ADDR_3_I_I:%.*]] = phi i32 [ [[AND17_LOBIT_I_I]], [[IF_THEN16_I_I]] ], [ [[P1_ADDR_2_I_I]], [[IF_END13_I_I]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; CHECK-NEXT: [[TOBOOL_I11_I:%.*]] = icmp eq i32 [[TMP8]], 0
-; CHECK-NEXT: br i1 [[TOBOOL_I11_I]], label [[_ZL3FN3II_EXIT:%.*]], label [[IF_THEN_I15_I:%.*]]
-; CHECK: if.then.i15.i:
-; CHECK-NEXT: [[AND_I12_I:%.*]] = lshr i32 [[P1_ADDR_3_I_I]], 2
-; CHECK-NEXT: [[AND_LOBIT_I13_I:%.*]] = and i32 [[AND_I12_I]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = xor i32 [[AND_LOBIT_I13_I]], 1
-; CHECK-NEXT: [[OR_I14_I:%.*]] = or i32 [[TMP9]], [[P1_ADDR_3_I_I]]
-; CHECK-NEXT: br label [[_ZL3FN3II_EXIT]]
-; CHECK: _ZL3fn3ii.exit:
-; CHECK-NEXT: [[P1_ADDR_0_I16_I:%.*]] = phi i32 [ [[OR_I14_I]], [[IF_THEN_I15_I]] ], [ [[P1_ADDR_3_I_I]], [[_Z3FN2III_EXIT_I]] ]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[P1_ADDR_0_I16_I]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT: br i1 [[TOBOOL_I_I]], label [[_Z3FN1II_EXIT_I26:%.*]], label [[IF_THEN_I_I21:%.*]]
-; CHECK: if.then.i.i21:
-; CHECK-NEXT: [[AND_I_I18:%.*]] = lshr i32 [[TMP10]], 2
-; CHECK-NEXT: [[AND_LOBIT_I_I19:%.*]] = and i32 [[AND_I_I18]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = xor i32 [[AND_LOBIT_I_I19]], 1
-; CHECK-NEXT: [[OR_I_I20:%.*]] = or i32 [[TMP11]], [[TMP10]]
-; CHECK-NEXT: br label [[_Z3FN1II_EXIT_I26]]
-; CHECK: _Z3fn1ii.exit.i26:
-; CHECK-NEXT: [[P1_ADDR_0_I_I22:%.*]] = phi i32 [ [[OR_I_I20]], [[IF_THEN_I_I21]] ], [ [[TMP10]], [[_ZL3FN3II_EXIT]] ]
-; CHECK-NEXT: br i1 [[TOBOOL_I87]], label [[IF_THEN_I63:%.*]], label [[IF_END_I67:%.*]]
-; CHECK: if.then.i63:
-; CHECK-NEXT: [[CMP_I61:%.*]] = icmp sgt i32 [[P1_ADDR_0_I_I22]], 0
-; CHECK-NEXT: [[CONV_I62:%.*]] = zext i1 [[CMP_I61]] to i32
-; CHECK-NEXT: br label [[IF_END_I67]]
-; CHECK: if.end.i67:
-; CHECK-NEXT: [[P1_ADDR_0_I64:%.*]] = phi i32 [ [[CONV_I62]], [[IF_THEN_I63]] ], [ [[P1_ADDR_0_I_I22]], [[_Z3FN1II_EXIT_I26]] ]
-; CHECK-NEXT: br i1 [[TOBOOL2_I]], label [[IF_END7_I73:%.*]], label [[IF_THEN3_I71:%.*]]
-; CHECK: if.then3.i71:
-; CHECK-NEXT: [[P1_ADDR_0_LOBIT_I68:%.*]] = lshr i32 [[P1_ADDR_0_I64]], 31
-; CHECK-NEXT: [[AND6_I69:%.*]] = and i32 [[P1_ADDR_0_I64]], 1
-; CHECK-NEXT: [[OR_I70:%.*]] = or i32 [[P1_ADDR_0_LOBIT_I68]], [[AND6_I69]]
-; CHECK-NEXT: br label [[IF_END7_I73]]
-; CHECK: if.end7.i73:
-; CHECK-NEXT: [[P1_ADDR_1_I72:%.*]] = phi i32 [ [[OR_I70]], [[IF_THEN3_I71]] ], [ [[P1_ADDR_0_I64]], [[IF_END_I67]] ]
-; CHECK-NEXT: br i1 [[TOBOOL_I87]], label [[IF_THEN10_I76:%.*]], label [[IF_END13_I80:%.*]]
-; CHECK: if.then10.i76:
-; CHECK-NEXT: [[CMP11_I74:%.*]] = icmp sgt i32 [[P1_ADDR_1_I72]], 0
-; CHECK-NEXT: [[CONV12_I75:%.*]] = zext i1 [[CMP11_I74]] to i32
-; CHECK-NEXT: br label [[IF_END13_I80]]
-; CHECK: if.end13.i80:
-; CHECK-NEXT: [[P1_ADDR_2_I77:%.*]] = phi i32 [ [[CONV12_I75]], [[IF_THEN10_I76]] ], [ [[P1_ADDR_1_I72]], [[IF_END7_I73]] ]
-; CHECK-NEXT: br i1 [[TOBOOL_I_I]], label [[_Z3FN2III_EXIT85:%.*]], label [[IF_THEN16_I83:%.*]]
-; CHECK: if.then16.i83:
-; CHECK-NEXT: [[AND17_I81:%.*]] = lshr i32 [[P1_ADDR_2_I77]], 3
-; CHECK-NEXT: [[AND17_LOBIT_I82:%.*]] = and i32 [[AND17_I81]], 1
-; CHECK-NEXT: br label [[_Z3FN2III_EXIT85]]
-; CHECK: _Z3fn2iii.exit85:
-; CHECK-NEXT: [[P1_ADDR_3_I84:%.*]] = phi i32 [ [[AND17_LOBIT_I82]], [[IF_THEN16_I83]] ], [ [[P1_ADDR_2_I77]], [[IF_END13_I80]] ]
-; CHECK-NEXT: br i1 [[TOBOOL_I19_I]], label [[IF_THEN_I20_I29:%.*]], label [[IF_END_I_I33:%.*]]
-; CHECK: if.then.i20.i29:
-; CHECK-NEXT: [[CMP_I_I27:%.*]] = icmp sgt i32 [[P1_ADDR_3_I84]], 0
-; CHECK-NEXT: [[CONV_I_I28:%.*]] = zext i1 [[CMP_I_I27]] to i32
-; CHECK-NEXT: br label [[IF_END_I_I33]]
-; CHECK: if.end.i.i33:
-; CHECK-NEXT: [[P1_ADDR_0_I21_I30:%.*]] = phi i32 [ [[CONV_I_I28]], [[IF_THEN_I20_I29]] ], [ [[P1_ADDR_3_I84]], [[_Z3FN2III_EXIT85]] ]
-; CHECK-NEXT: br i1 [[TOBOOL2_I_I]], label [[IF_END7_I_I39:%.*]], label [[IF_THEN3_I_I37:%.*]]
-; CHECK: if.then3.i.i37:
-; CHECK-NEXT: [[P1_ADDR_0_LOBIT_I_I34:%.*]] = lshr i32 [[P1_ADDR_0_I21_I30]], 31
-; CHECK-NEXT: [[AND6_I_I35:%.*]] = and i32 [[P1_ADDR_0_I21_I30]], 1
-; CHECK-NEXT: [[OR_I22_I36:%.*]] = or i32 [[P1_ADDR_0_LOBIT_I_I34]], [[AND6_I_I35]]
-; CHECK-NEXT: br label [[IF_END7_I_I39]]
-; CHECK: if.end7.i.i39:
-; CHECK-NEXT: [[P1_ADDR_1_I_I38:%.*]] = phi i32 [ [[OR_I22_I36]], [[IF_THEN3_I_I37]] ], [ [[P1_ADDR_0_I21_I30]], [[IF_END_I_I33]] ]
-; CHECK-NEXT: br i1 [[TOBOOL_I19_I]], label [[IF_THEN10_I_I42:%.*]], label [[IF_END13_I_I46:%.*]]
-; CHECK: if.then10.i.i42:
-; CHECK-NEXT: [[CMP11_I_I40:%.*]] = icmp sgt i32 [[P1_ADDR_1_I_I38]], 0
-; CHECK-NEXT: [[CONV12_I_I41:%.*]] = zext i1 [[CMP11_I_I40]] to i32
-; CHECK-NEXT: br label [[IF_END13_I_I46]]
-; CHECK: if.end13.i.i46:
-; CHECK-NEXT: [[P1_ADDR_2_I_I43:%.*]] = phi i32 [ [[CONV12_I_I41]], [[IF_THEN10_I_I42]] ], [ [[P1_ADDR_1_I_I38]], [[IF_END7_I_I39]] ]
-; CHECK-NEXT: br i1 [[TOBOOL15_I_I]], label [[_Z3FN2III_EXIT_I52:%.*]], label [[IF_THEN16_I_I49:%.*]]
-; CHECK: if.then16.i.i49:
-; CHECK-NEXT: [[AND17_I_I47:%.*]] = lshr i32 [[P1_ADDR_2_I_I43]], 3
-; CHECK-NEXT: [[AND17_LOBIT_I_I48:%.*]] = and i32 [[AND17_I_I47]], 1
-; CHECK-NEXT: br label [[_Z3FN2III_EXIT_I52]]
-; CHECK: _Z3fn2iii.exit.i52:
-; CHECK-NEXT: [[P1_ADDR_3_I_I50:%.*]] = phi i32 [ [[AND17_LOBIT_I_I48]], [[IF_THEN16_I_I49]] ], [ [[P1_ADDR_2_I_I43]], [[IF_END13_I_I46]] ]
-; CHECK-NEXT: br i1 [[TOBOOL_I11_I]], label [[_ZL3FN3II_EXIT58]], label [[IF_THEN_I15_I56:%.*]]
-; CHECK: if.then.i15.i56:
-; CHECK-NEXT: [[AND_I12_I53:%.*]] = lshr i32 [[P1_ADDR_3_I_I50]], 2
-; CHECK-NEXT: [[AND_LOBIT_I13_I54:%.*]] = and i32 [[AND_I12_I53]], 1
-; CHECK-NEXT: [[TMP12:%.*]] = xor i32 [[AND_LOBIT_I13_I54]], 1
-; CHECK-NEXT: [[OR_I14_I55:%.*]] = or i32 [[TMP12]], [[P1_ADDR_3_I_I50]]
-; CHECK-NEXT: br label [[_ZL3FN3II_EXIT58]]
-; CHECK: _ZL3fn3ii.exit58:
-; CHECK-NEXT: [[P1_ADDR_0_I16_I57:%.*]] = phi i32 [ [[OR_I14_I55]], [[IF_THEN_I15_I56]] ], [ [[P1_ADDR_3_I_I50]], [[_Z3FN2III_EXIT_I52]] ]
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[P1_ADDR_0_I16_I57]], i32* [[ARRAYIDX7]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[LFTR_WIDEIV]], [[P1]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]]
-; CHECK: for.cond.for.end_crit_edge:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
entry:
%cmp88 = icmp sgt i32 %p1, 0
br i1 %cmp88, label %for.body.lr.ph, label %for.end
; CHECK-NEXT: store <4 x i32> [[PREDPHI7]], <4 x i32>* [[TMP17]], align 4, !alias.scope !0, !noalias !3
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP5:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], [[LOOP7:!llvm.loop !.*]]
; CHECK: for.end.loopexit:
; CHECK-NEXT: br label [[FOR_END]]
; CHECK: for.end:
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -verify-loop-info -simplifycfg -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; Test that instructions which are provably safe, e.g. a division by a non-zero
; constant, are not predicated.
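;
; A minimal sketch of the property being tested (not part of the original
; test; the names and constants here are hypothetical): a divide whose divisor
; is a non-zero constant cannot trap, so the vectorizer can execute it for all
; lanes without a mask, while a divide by an arbitrary value may trap and has
; to stay behind its guard, typically becoming a predicated if/continue
; sequence in the vector body.
define void @safe_vs_guarded_div(i32* nocapture %out, i32 %d) {
entry:
  br label %loop
loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
  %p = getelementptr inbounds i32, i32* %out, i64 %iv
  %v = load i32, i32* %p, align 4
  ; Provably safe: the divisor 11 is a non-zero constant, so this sdiv may be
  ; speculated and vectorized unconditionally.
  %safe = sdiv i32 %v, 11
  %cmp = icmp slt i32 %v, 100
  br i1 %cmp, label %guarded, label %latch
guarded:
  ; Not provably safe: %d may be zero, so this sdiv must only execute when
  ; %cmp holds, which forces predicated (guarded, scalarized) execution.
  %unsafe = sdiv i32 %safe, %d
  br label %latch
latch:
  %r = phi i32 [ %safe, %loop ], [ %unsafe, %guarded ]
  store i32 %r, i32* %p, align 4
  %iv.next = add nuw nsw i64 %iv, 1
  %exit = icmp eq i64 %iv.next, 128
  br i1 %exit, label %end, label %loop
end:
  ret void
}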
define void @test(i32* nocapture %asd, i32* nocapture %aud,
-; CHECK-LABEL: @test(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ASD1:%.*]] = bitcast i32* [[ASD:%.*]] to i8*
-; CHECK-NEXT: [[AUD3:%.*]] = bitcast i32* [[AUD:%.*]] to i8*
-; CHECK-NEXT: [[ASR6:%.*]] = bitcast i32* [[ASR:%.*]] to i8*
-; CHECK-NEXT: [[AUR9:%.*]] = bitcast i32* [[AUR:%.*]] to i8*
-; CHECK-NEXT: [[ASD012:%.*]] = bitcast i32* [[ASD0:%.*]] to i8*
-; CHECK-NEXT: [[AUD015:%.*]] = bitcast i32* [[AUD0:%.*]] to i8*
-; CHECK-NEXT: [[ASR018:%.*]] = bitcast i32* [[ASR0:%.*]] to i8*
-; CHECK-NEXT: [[AUR021:%.*]] = bitcast i32* [[AUR0:%.*]] to i8*
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[ASD]], i64 128
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[AUD]], i64 128
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[ASR]], i64 128
-; CHECK-NEXT: [[SCEVGEP78:%.*]] = bitcast i32* [[SCEVGEP7]] to i8*
-; CHECK-NEXT: [[SCEVGEP10:%.*]] = getelementptr i32, i32* [[AUR]], i64 128
-; CHECK-NEXT: [[SCEVGEP1011:%.*]] = bitcast i32* [[SCEVGEP10]] to i8*
-; CHECK-NEXT: [[SCEVGEP13:%.*]] = getelementptr i32, i32* [[ASD0]], i64 128
-; CHECK-NEXT: [[SCEVGEP1314:%.*]] = bitcast i32* [[SCEVGEP13]] to i8*
-; CHECK-NEXT: [[SCEVGEP16:%.*]] = getelementptr i32, i32* [[AUD0]], i64 128
-; CHECK-NEXT: [[SCEVGEP1617:%.*]] = bitcast i32* [[SCEVGEP16]] to i8*
-; CHECK-NEXT: [[SCEVGEP19:%.*]] = getelementptr i32, i32* [[ASR0]], i64 128
-; CHECK-NEXT: [[SCEVGEP1920:%.*]] = bitcast i32* [[SCEVGEP19]] to i8*
-; CHECK-NEXT: [[SCEVGEP22:%.*]] = getelementptr i32, i32* [[AUR0]], i64 128
-; CHECK-NEXT: [[SCEVGEP2223:%.*]] = bitcast i32* [[SCEVGEP22]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: [[BOUND024:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP78]]
-; CHECK-NEXT: [[BOUND125:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT26:%.*]] = and i1 [[BOUND024]], [[BOUND125]]
-; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT26]]
-; CHECK-NEXT: [[BOUND027:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP1011]]
-; CHECK-NEXT: [[BOUND128:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT29:%.*]] = and i1 [[BOUND027]], [[BOUND128]]
-; CHECK-NEXT: [[CONFLICT_RDX30:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT29]]
-; CHECK-NEXT: [[BOUND031:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP1314]]
-; CHECK-NEXT: [[BOUND132:%.*]] = icmp ult i8* [[ASD012]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT33:%.*]] = and i1 [[BOUND031]], [[BOUND132]]
-; CHECK-NEXT: [[CONFLICT_RDX34:%.*]] = or i1 [[CONFLICT_RDX30]], [[FOUND_CONFLICT33]]
-; CHECK-NEXT: [[BOUND035:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP1617]]
-; CHECK-NEXT: [[BOUND136:%.*]] = icmp ult i8* [[AUD015]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT37:%.*]] = and i1 [[BOUND035]], [[BOUND136]]
-; CHECK-NEXT: [[CONFLICT_RDX38:%.*]] = or i1 [[CONFLICT_RDX34]], [[FOUND_CONFLICT37]]
-; CHECK-NEXT: [[BOUND039:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP1920]]
-; CHECK-NEXT: [[BOUND140:%.*]] = icmp ult i8* [[ASR018]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT41:%.*]] = and i1 [[BOUND039]], [[BOUND140]]
-; CHECK-NEXT: [[CONFLICT_RDX42:%.*]] = or i1 [[CONFLICT_RDX38]], [[FOUND_CONFLICT41]]
-; CHECK-NEXT: [[BOUND043:%.*]] = icmp ult i8* [[ASD1]], [[SCEVGEP2223]]
-; CHECK-NEXT: [[BOUND144:%.*]] = icmp ult i8* [[AUR021]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT45:%.*]] = and i1 [[BOUND043]], [[BOUND144]]
-; CHECK-NEXT: [[CONFLICT_RDX46:%.*]] = or i1 [[CONFLICT_RDX42]], [[FOUND_CONFLICT45]]
-; CHECK-NEXT: [[BOUND047:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP78]]
-; CHECK-NEXT: [[BOUND148:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP45]]
-; CHECK-NEXT: [[FOUND_CONFLICT49:%.*]] = and i1 [[BOUND047]], [[BOUND148]]
-; CHECK-NEXT: [[CONFLICT_RDX50:%.*]] = or i1 [[CONFLICT_RDX46]], [[FOUND_CONFLICT49]]
-; CHECK-NEXT: [[BOUND051:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP1011]]
-; CHECK-NEXT: [[BOUND152:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP45]]
-; CHECK-NEXT: [[FOUND_CONFLICT53:%.*]] = and i1 [[BOUND051]], [[BOUND152]]
-; CHECK-NEXT: [[CONFLICT_RDX54:%.*]] = or i1 [[CONFLICT_RDX50]], [[FOUND_CONFLICT53]]
-; CHECK-NEXT: [[BOUND055:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP1314]]
-; CHECK-NEXT: [[BOUND156:%.*]] = icmp ult i8* [[ASD012]], [[SCEVGEP45]]
-; CHECK-NEXT: [[FOUND_CONFLICT57:%.*]] = and i1 [[BOUND055]], [[BOUND156]]
-; CHECK-NEXT: [[CONFLICT_RDX58:%.*]] = or i1 [[CONFLICT_RDX54]], [[FOUND_CONFLICT57]]
-; CHECK-NEXT: [[BOUND059:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP1617]]
-; CHECK-NEXT: [[BOUND160:%.*]] = icmp ult i8* [[AUD015]], [[SCEVGEP45]]
-; CHECK-NEXT: [[FOUND_CONFLICT61:%.*]] = and i1 [[BOUND059]], [[BOUND160]]
-; CHECK-NEXT: [[CONFLICT_RDX62:%.*]] = or i1 [[CONFLICT_RDX58]], [[FOUND_CONFLICT61]]
-; CHECK-NEXT: [[BOUND063:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP1920]]
-; CHECK-NEXT: [[BOUND164:%.*]] = icmp ult i8* [[ASR018]], [[SCEVGEP45]]
-; CHECK-NEXT: [[FOUND_CONFLICT65:%.*]] = and i1 [[BOUND063]], [[BOUND164]]
-; CHECK-NEXT: [[CONFLICT_RDX66:%.*]] = or i1 [[CONFLICT_RDX62]], [[FOUND_CONFLICT65]]
-; CHECK-NEXT: [[BOUND067:%.*]] = icmp ult i8* [[AUD3]], [[SCEVGEP2223]]
-; CHECK-NEXT: [[BOUND168:%.*]] = icmp ult i8* [[AUR021]], [[SCEVGEP45]]
-; CHECK-NEXT: [[FOUND_CONFLICT69:%.*]] = and i1 [[BOUND067]], [[BOUND168]]
-; CHECK-NEXT: [[CONFLICT_RDX70:%.*]] = or i1 [[CONFLICT_RDX66]], [[FOUND_CONFLICT69]]
-; CHECK-NEXT: [[BOUND071:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP1011]]
-; CHECK-NEXT: [[BOUND172:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP78]]
-; CHECK-NEXT: [[FOUND_CONFLICT73:%.*]] = and i1 [[BOUND071]], [[BOUND172]]
-; CHECK-NEXT: [[CONFLICT_RDX74:%.*]] = or i1 [[CONFLICT_RDX70]], [[FOUND_CONFLICT73]]
-; CHECK-NEXT: [[BOUND075:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP1314]]
-; CHECK-NEXT: [[BOUND176:%.*]] = icmp ult i8* [[ASD012]], [[SCEVGEP78]]
-; CHECK-NEXT: [[FOUND_CONFLICT77:%.*]] = and i1 [[BOUND075]], [[BOUND176]]
-; CHECK-NEXT: [[CONFLICT_RDX78:%.*]] = or i1 [[CONFLICT_RDX74]], [[FOUND_CONFLICT77]]
-; CHECK-NEXT: [[BOUND079:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP1617]]
-; CHECK-NEXT: [[BOUND180:%.*]] = icmp ult i8* [[AUD015]], [[SCEVGEP78]]
-; CHECK-NEXT: [[FOUND_CONFLICT81:%.*]] = and i1 [[BOUND079]], [[BOUND180]]
-; CHECK-NEXT: [[CONFLICT_RDX82:%.*]] = or i1 [[CONFLICT_RDX78]], [[FOUND_CONFLICT81]]
-; CHECK-NEXT: [[BOUND083:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP1920]]
-; CHECK-NEXT: [[BOUND184:%.*]] = icmp ult i8* [[ASR018]], [[SCEVGEP78]]
-; CHECK-NEXT: [[FOUND_CONFLICT85:%.*]] = and i1 [[BOUND083]], [[BOUND184]]
-; CHECK-NEXT: [[CONFLICT_RDX86:%.*]] = or i1 [[CONFLICT_RDX82]], [[FOUND_CONFLICT85]]
-; CHECK-NEXT: [[BOUND087:%.*]] = icmp ult i8* [[ASR6]], [[SCEVGEP2223]]
-; CHECK-NEXT: [[BOUND188:%.*]] = icmp ult i8* [[AUR021]], [[SCEVGEP78]]
-; CHECK-NEXT: [[FOUND_CONFLICT89:%.*]] = and i1 [[BOUND087]], [[BOUND188]]
-; CHECK-NEXT: [[CONFLICT_RDX90:%.*]] = or i1 [[CONFLICT_RDX86]], [[FOUND_CONFLICT89]]
-; CHECK-NEXT: [[BOUND091:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP1314]]
-; CHECK-NEXT: [[BOUND192:%.*]] = icmp ult i8* [[ASD012]], [[SCEVGEP1011]]
-; CHECK-NEXT: [[FOUND_CONFLICT93:%.*]] = and i1 [[BOUND091]], [[BOUND192]]
-; CHECK-NEXT: [[CONFLICT_RDX94:%.*]] = or i1 [[CONFLICT_RDX90]], [[FOUND_CONFLICT93]]
-; CHECK-NEXT: [[BOUND095:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP1617]]
-; CHECK-NEXT: [[BOUND196:%.*]] = icmp ult i8* [[AUD015]], [[SCEVGEP1011]]
-; CHECK-NEXT: [[FOUND_CONFLICT97:%.*]] = and i1 [[BOUND095]], [[BOUND196]]
-; CHECK-NEXT: [[CONFLICT_RDX98:%.*]] = or i1 [[CONFLICT_RDX94]], [[FOUND_CONFLICT97]]
-; CHECK-NEXT: [[BOUND099:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP1920]]
-; CHECK-NEXT: [[BOUND1100:%.*]] = icmp ult i8* [[ASR018]], [[SCEVGEP1011]]
-; CHECK-NEXT: [[FOUND_CONFLICT101:%.*]] = and i1 [[BOUND099]], [[BOUND1100]]
-; CHECK-NEXT: [[CONFLICT_RDX102:%.*]] = or i1 [[CONFLICT_RDX98]], [[FOUND_CONFLICT101]]
-; CHECK-NEXT: [[BOUND0103:%.*]] = icmp ult i8* [[AUR9]], [[SCEVGEP2223]]
-; CHECK-NEXT: [[BOUND1104:%.*]] = icmp ult i8* [[AUR021]], [[SCEVGEP1011]]
-; CHECK-NEXT: [[FOUND_CONFLICT105:%.*]] = and i1 [[BOUND0103]], [[BOUND1104]]
-; CHECK-NEXT: [[CONFLICT_RDX106:%.*]] = or i1 [[CONFLICT_RDX102]], [[FOUND_CONFLICT105]]
-; CHECK-NEXT: [[BOUND0107:%.*]] = icmp ult i8* [[ASD012]], [[SCEVGEP1617]]
-; CHECK-NEXT: [[BOUND1108:%.*]] = icmp ult i8* [[AUD015]], [[SCEVGEP1314]]
-; CHECK-NEXT: [[FOUND_CONFLICT109:%.*]] = and i1 [[BOUND0107]], [[BOUND1108]]
-; CHECK-NEXT: [[CONFLICT_RDX110:%.*]] = or i1 [[CONFLICT_RDX106]], [[FOUND_CONFLICT109]]
-; CHECK-NEXT: [[BOUND0111:%.*]] = icmp ult i8* [[ASD012]], [[SCEVGEP1920]]
-; CHECK-NEXT: [[BOUND1112:%.*]] = icmp ult i8* [[ASR018]], [[SCEVGEP1314]]
-; CHECK-NEXT: [[FOUND_CONFLICT113:%.*]] = and i1 [[BOUND0111]], [[BOUND1112]]
-; CHECK-NEXT: [[CONFLICT_RDX114:%.*]] = or i1 [[CONFLICT_RDX110]], [[FOUND_CONFLICT113]]
-; CHECK-NEXT: [[BOUND0115:%.*]] = icmp ult i8* [[ASD012]], [[SCEVGEP2223]]
-; CHECK-NEXT: [[BOUND1116:%.*]] = icmp ult i8* [[AUR021]], [[SCEVGEP1314]]
-; CHECK-NEXT: [[FOUND_CONFLICT117:%.*]] = and i1 [[BOUND0115]], [[BOUND1116]]
-; CHECK-NEXT: [[CONFLICT_RDX118:%.*]] = or i1 [[CONFLICT_RDX114]], [[FOUND_CONFLICT117]]
-; CHECK-NEXT: [[BOUND0119:%.*]] = icmp ult i8* [[AUD015]], [[SCEVGEP1920]]
-; CHECK-NEXT: [[BOUND1120:%.*]] = icmp ult i8* [[ASR018]], [[SCEVGEP1617]]
-; CHECK-NEXT: [[FOUND_CONFLICT121:%.*]] = and i1 [[BOUND0119]], [[BOUND1120]]
-; CHECK-NEXT: [[CONFLICT_RDX122:%.*]] = or i1 [[CONFLICT_RDX118]], [[FOUND_CONFLICT121]]
-; CHECK-NEXT: [[BOUND0123:%.*]] = icmp ult i8* [[AUD015]], [[SCEVGEP2223]]
-; CHECK-NEXT: [[BOUND1124:%.*]] = icmp ult i8* [[AUR021]], [[SCEVGEP1617]]
-; CHECK-NEXT: [[FOUND_CONFLICT125:%.*]] = and i1 [[BOUND0123]], [[BOUND1124]]
-; CHECK-NEXT: [[CONFLICT_RDX126:%.*]] = or i1 [[CONFLICT_RDX122]], [[FOUND_CONFLICT125]]
-; CHECK-NEXT: [[BOUND0127:%.*]] = icmp ult i8* [[ASR018]], [[SCEVGEP2223]]
-; CHECK-NEXT: [[BOUND1128:%.*]] = icmp ult i8* [[AUR021]], [[SCEVGEP1920]]
-; CHECK-NEXT: [[FOUND_CONFLICT129:%.*]] = and i1 [[BOUND0127]], [[BOUND1128]]
-; CHECK-NEXT: [[CONFLICT_RDX130:%.*]] = or i1 [[CONFLICT_RDX126]], [[FOUND_CONFLICT129]]
-; CHECK-NEXT: br i1 [[CONFLICT_RDX130]], label [[SCALAR_PH:%.*]], label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[PRED_UREM_CONTINUE139:%.*]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD131:%.*]] = load <2 x i32>, <2 x i32>* [[TMP8]], align 4, !alias.scope !11, !noalias !12
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD132:%.*]] = load <2 x i32>, <2 x i32>* [[TMP10]], align 4, !alias.scope !13, !noalias !14
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD133:%.*]] = load <2 x i32>, <2 x i32>* [[TMP12]], align 4, !alias.scope !15, !noalias !16
-; CHECK-NEXT: [[TMP13:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], <i32 23, i32 23>
-; CHECK-NEXT: [[TMP14:%.*]] = add nsw <2 x i32> [[WIDE_LOAD131]], <i32 24, i32 24>
-; CHECK-NEXT: [[TMP15:%.*]] = add nsw <2 x i32> [[WIDE_LOAD132]], <i32 25, i32 25>
-; CHECK-NEXT: [[TMP16:%.*]] = add nsw <2 x i32> [[WIDE_LOAD133]], <i32 26, i32 26>
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[ASD0]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[AUD0]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[ASR0]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[AUR0]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD134:%.*]] = load <2 x i32>, <2 x i32>* [[TMP22]], align 4, !alias.scope !17, !noalias !18
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD135:%.*]] = load <2 x i32>, <2 x i32>* [[TMP24]], align 4, !alias.scope !19, !noalias !20
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD136:%.*]] = load <2 x i32>, <2 x i32>* [[TMP26]], align 4, !alias.scope !21, !noalias !22
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP28:%.*]] = bitcast i32* [[TMP27]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD137:%.*]] = load <2 x i32>, <2 x i32>* [[TMP28]], align 4, !alias.scope !22
-; CHECK-NEXT: [[TMP29:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], <i32 27, i32 27>
-; CHECK-NEXT: [[TMP30:%.*]] = add nsw <2 x i32> [[WIDE_LOAD131]], <i32 28, i32 28>
-; CHECK-NEXT: [[TMP31:%.*]] = add nsw <2 x i32> [[WIDE_LOAD132]], <i32 29, i32 29>
-; CHECK-NEXT: [[TMP32:%.*]] = add nsw <2 x i32> [[WIDE_LOAD133]], <i32 30, i32 30>
-; CHECK-NEXT: [[TMP33:%.*]] = icmp slt <2 x i32> [[WIDE_LOAD]], <i32 100, i32 100>
-; CHECK-NEXT: [[TMP34:%.*]] = sdiv <2 x i32> [[TMP13]], <i32 11, i32 11>
-; CHECK-NEXT: [[TMP35:%.*]] = udiv <2 x i32> [[TMP14]], <i32 13, i32 13>
-; CHECK-NEXT: [[TMP36:%.*]] = srem <2 x i32> [[TMP15]], <i32 17, i32 17>
-; CHECK-NEXT: [[TMP37:%.*]] = urem <2 x i32> [[TMP16]], <i32 19, i32 19>
-; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP33]], i32 0
-; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_UREM_IF:%.*]], label [[PRED_UREM_CONTINUE:%.*]]
-; CHECK: pred.urem.if:
-; CHECK-NEXT: [[TMP39:%.*]] = extractelement <2 x i32> [[TMP29]], i32 0
-; CHECK-NEXT: [[TMP40:%.*]] = sdiv i32 [[TMP39]], 0
-; CHECK-NEXT: [[TMP41:%.*]] = insertelement <2 x i32> poison, i32 [[TMP40]], i32 0
-; CHECK-NEXT: [[TMP42:%.*]] = extractelement <2 x i32> [[TMP30]], i32 0
-; CHECK-NEXT: [[TMP43:%.*]] = udiv i32 [[TMP42]], 0
-; CHECK-NEXT: [[TMP44:%.*]] = insertelement <2 x i32> poison, i32 [[TMP43]], i32 0
-; CHECK-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP31]], i32 0
-; CHECK-NEXT: [[TMP46:%.*]] = srem i32 [[TMP45]], 0
-; CHECK-NEXT: [[TMP47:%.*]] = insertelement <2 x i32> poison, i32 [[TMP46]], i32 0
-; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP32]], i32 0
-; CHECK-NEXT: [[TMP49:%.*]] = urem i32 [[TMP48]], 0
-; CHECK-NEXT: [[TMP50:%.*]] = insertelement <2 x i32> poison, i32 [[TMP49]], i32 0
-; CHECK-NEXT: br label [[PRED_UREM_CONTINUE]]
-; CHECK: pred.urem.continue:
-; CHECK-NEXT: [[TMP51:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP41]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT: [[TMP52:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP44]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT: [[TMP53:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP47]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT: [[TMP54:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP50]], [[PRED_UREM_IF]] ]
-; CHECK-NEXT: [[TMP55:%.*]] = extractelement <2 x i1> [[TMP33]], i32 1
-; CHECK-NEXT: br i1 [[TMP55]], label [[PRED_UREM_IF138:%.*]], label [[PRED_UREM_CONTINUE139]]
-; CHECK: pred.urem.if138:
-; CHECK-NEXT: [[TMP56:%.*]] = extractelement <2 x i32> [[TMP29]], i32 1
-; CHECK-NEXT: [[TMP57:%.*]] = sdiv i32 [[TMP56]], 0
-; CHECK-NEXT: [[TMP58:%.*]] = insertelement <2 x i32> [[TMP51]], i32 [[TMP57]], i32 1
-; CHECK-NEXT: [[TMP59:%.*]] = extractelement <2 x i32> [[TMP30]], i32 1
-; CHECK-NEXT: [[TMP60:%.*]] = udiv i32 [[TMP59]], 0
-; CHECK-NEXT: [[TMP61:%.*]] = insertelement <2 x i32> [[TMP52]], i32 [[TMP60]], i32 1
-; CHECK-NEXT: [[TMP62:%.*]] = extractelement <2 x i32> [[TMP31]], i32 1
-; CHECK-NEXT: [[TMP63:%.*]] = srem i32 [[TMP62]], 0
-; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i32> [[TMP53]], i32 [[TMP63]], i32 1
-; CHECK-NEXT: [[TMP65:%.*]] = extractelement <2 x i32> [[TMP32]], i32 1
-; CHECK-NEXT: [[TMP66:%.*]] = urem i32 [[TMP65]], 0
-; CHECK-NEXT: [[TMP67:%.*]] = insertelement <2 x i32> [[TMP54]], i32 [[TMP66]], i32 1
-; CHECK-NEXT: br label [[PRED_UREM_CONTINUE139]]
-; CHECK: pred.urem.continue139:
-; CHECK-NEXT: [[TMP68:%.*]] = phi <2 x i32> [ [[TMP51]], [[PRED_UREM_CONTINUE]] ], [ [[TMP58]], [[PRED_UREM_IF138]] ]
-; CHECK-NEXT: [[TMP69:%.*]] = phi <2 x i32> [ [[TMP52]], [[PRED_UREM_CONTINUE]] ], [ [[TMP61]], [[PRED_UREM_IF138]] ]
-; CHECK-NEXT: [[TMP70:%.*]] = phi <2 x i32> [ [[TMP53]], [[PRED_UREM_CONTINUE]] ], [ [[TMP64]], [[PRED_UREM_IF138]] ]
-; CHECK-NEXT: [[TMP71:%.*]] = phi <2 x i32> [ [[TMP54]], [[PRED_UREM_CONTINUE]] ], [ [[TMP67]], [[PRED_UREM_IF138]] ]
-; CHECK-NEXT: [[TMP72:%.*]] = xor <2 x i1> [[TMP33]], <i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP13]], <2 x i32> [[TMP34]]
-; CHECK-NEXT: [[PREDPHI140:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP14]], <2 x i32> [[TMP35]]
-; CHECK-NEXT: [[PREDPHI141:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP15]], <2 x i32> [[TMP36]]
-; CHECK-NEXT: [[PREDPHI142:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP16]], <2 x i32> [[TMP37]]
-; CHECK-NEXT: [[PREDPHI143:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP29]], <2 x i32> [[TMP68]]
-; CHECK-NEXT: [[PREDPHI144:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP30]], <2 x i32> [[TMP69]]
-; CHECK-NEXT: [[PREDPHI145:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP31]], <2 x i32> [[TMP70]]
-; CHECK-NEXT: [[PREDPHI146:%.*]] = select <2 x i1> [[TMP72]], <2 x i32> [[TMP32]], <2 x i32> [[TMP71]]
-; CHECK-NEXT: [[TMP73:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP74:%.*]] = bitcast i32* [[TMP73]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[PREDPHI]], <2 x i32>* [[TMP74]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP76:%.*]] = bitcast i32* [[TMP75]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[PREDPHI140]], <2 x i32>* [[TMP76]], align 4, !alias.scope !11, !noalias !12
-; CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP78:%.*]] = bitcast i32* [[TMP77]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[PREDPHI141]], <2 x i32>* [[TMP78]], align 4, !alias.scope !13, !noalias !14
-; CHECK-NEXT: [[TMP79:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP80:%.*]] = bitcast i32* [[TMP79]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[PREDPHI142]], <2 x i32>* [[TMP80]], align 4, !alias.scope !15, !noalias !16
-; CHECK-NEXT: [[TMP81:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP82:%.*]] = bitcast i32* [[TMP81]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[PREDPHI143]], <2 x i32>* [[TMP82]], align 4, !alias.scope !17, !noalias !18
-; CHECK-NEXT: [[TMP83:%.*]] = getelementptr inbounds i32, i32* [[TMP18]], i32 0
-; CHECK-NEXT: [[TMP84:%.*]] = bitcast i32* [[TMP83]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[PREDPHI144]], <2 x i32>* [[TMP84]], align 4, !alias.scope !19, !noalias !20
-; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT: [[TMP86:%.*]] = bitcast i32* [[TMP85]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[PREDPHI145]], <2 x i32>* [[TMP86]], align 4, !alias.scope !21, !noalias !22
-; CHECK-NEXT: [[TMP87:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP88:%.*]] = bitcast i32* [[TMP87]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[PREDPHI146]], <2 x i32>* [[TMP88]], align 4, !alias.scope !22
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP89:%.*]] = icmp eq i64 [[INDEX_NEXT]], 128
-; CHECK-NEXT: br i1 [[TMP89]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 128, 128
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 128, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END:%.*]] ]
-; CHECK-NEXT: [[ISD:%.*]] = getelementptr inbounds i32, i32* [[ASD]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[IUD:%.*]] = getelementptr inbounds i32, i32* [[AUD]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[ISR:%.*]] = getelementptr inbounds i32, i32* [[ASR]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[IUR:%.*]] = getelementptr inbounds i32, i32* [[AUR]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[LSD:%.*]] = load i32, i32* [[ISD]], align 4
-; CHECK-NEXT: [[LUD:%.*]] = load i32, i32* [[IUD]], align 4
-; CHECK-NEXT: [[LSR:%.*]] = load i32, i32* [[ISR]], align 4
-; CHECK-NEXT: [[LUR:%.*]] = load i32, i32* [[IUR]], align 4
-; CHECK-NEXT: [[PSD:%.*]] = add nsw i32 [[LSD]], 23
-; CHECK-NEXT: [[PUD:%.*]] = add nsw i32 [[LUD]], 24
-; CHECK-NEXT: [[PSR:%.*]] = add nsw i32 [[LSR]], 25
-; CHECK-NEXT: [[PUR:%.*]] = add nsw i32 [[LUR]], 26
-; CHECK-NEXT: [[ISD0:%.*]] = getelementptr inbounds i32, i32* [[ASD0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[IUD0:%.*]] = getelementptr inbounds i32, i32* [[AUD0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[ISR0:%.*]] = getelementptr inbounds i32, i32* [[ASR0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[IUR0:%.*]] = getelementptr inbounds i32, i32* [[AUR0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[LSD0:%.*]] = load i32, i32* [[ISD0]], align 4
-; CHECK-NEXT: [[LUD0:%.*]] = load i32, i32* [[IUD0]], align 4
-; CHECK-NEXT: [[LSR0:%.*]] = load i32, i32* [[ISR0]], align 4
-; CHECK-NEXT: [[LUR0:%.*]] = load i32, i32* [[IUR0]], align 4
-; CHECK-NEXT: [[PSD0:%.*]] = add nsw i32 [[LSD]], 27
-; CHECK-NEXT: [[PUD0:%.*]] = add nsw i32 [[LUD]], 28
-; CHECK-NEXT: [[PSR0:%.*]] = add nsw i32 [[LSR]], 29
-; CHECK-NEXT: [[PUR0:%.*]] = add nsw i32 [[LUR]], 30
-; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[LSD]], 100
-; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_END]]
-; CHECK: if.then:
-; CHECK-NEXT: [[RSD:%.*]] = sdiv i32 [[PSD]], 11
-; CHECK-NEXT: [[RUD:%.*]] = udiv i32 [[PUD]], 13
-; CHECK-NEXT: [[RSR:%.*]] = srem i32 [[PSR]], 17
-; CHECK-NEXT: [[RUR:%.*]] = urem i32 [[PUR]], 19
-; CHECK-NEXT: [[RSD0:%.*]] = sdiv i32 [[PSD0]], 0
-; CHECK-NEXT: [[RUD0:%.*]] = udiv i32 [[PUD0]], 0
-; CHECK-NEXT: [[RSR0:%.*]] = srem i32 [[PSR0]], 0
-; CHECK-NEXT: [[RUR0:%.*]] = urem i32 [[PUR0]], 0
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[YSD_0:%.*]] = phi i32 [ [[RSD]], [[IF_THEN]] ], [ [[PSD]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[YUD_0:%.*]] = phi i32 [ [[RUD]], [[IF_THEN]] ], [ [[PUD]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[YSR_0:%.*]] = phi i32 [ [[RSR]], [[IF_THEN]] ], [ [[PSR]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[YUR_0:%.*]] = phi i32 [ [[RUR]], [[IF_THEN]] ], [ [[PUR]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[YSD0_0:%.*]] = phi i32 [ [[RSD0]], [[IF_THEN]] ], [ [[PSD0]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[YUD0_0:%.*]] = phi i32 [ [[RUD0]], [[IF_THEN]] ], [ [[PUD0]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[YSR0_0:%.*]] = phi i32 [ [[RSR0]], [[IF_THEN]] ], [ [[PSR0]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[YUR0_0:%.*]] = phi i32 [ [[RUR0]], [[IF_THEN]] ], [ [[PUR0]], [[FOR_BODY]] ]
-; CHECK-NEXT: store i32 [[YSD_0]], i32* [[ISD]], align 4
-; CHECK-NEXT: store i32 [[YUD_0]], i32* [[IUD]], align 4
-; CHECK-NEXT: store i32 [[YSR_0]], i32* [[ISR]], align 4
-; CHECK-NEXT: store i32 [[YUR_0]], i32* [[IUR]], align 4
-; CHECK-NEXT: store i32 [[YSD0_0]], i32* [[ISD0]], align 4
-; CHECK-NEXT: store i32 [[YUD0_0]], i32* [[IUD0]], align 4
-; CHECK-NEXT: store i32 [[YSR0_0]], i32* [[ISR0]], align 4
-; CHECK-NEXT: store i32 [[YUR0_0]], i32* [[IUR0]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 128
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]]
-;
- i32* nocapture %asr, i32* nocapture %aur,
- i32* nocapture %asd0, i32* nocapture %aud0,
- i32* nocapture %asr0, i32* nocapture %aur0
+ i32* nocapture %asr, i32* nocapture %aur,
+ i32* nocapture %asd0, i32* nocapture %aud0,
+ i32* nocapture %asr0, i32* nocapture %aur0
) {
entry:
br label %for.body
for.cond.cleanup: ; preds = %if.end
ret void
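+; The divisions with non-zero constant divisors are expected to be widened in
+; the vector body; no wide division with a zero divisor should be created.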
+; CHECK-LABEL: test
+; CHECK: vector.body:
+; CHECK: %{{.*}} = sdiv <2 x i32> %{{.*}}, <i32 11, i32 11>
+; CHECK: %{{.*}} = udiv <2 x i32> %{{.*}}, <i32 13, i32 13>
+; CHECK: %{{.*}} = srem <2 x i32> %{{.*}}, <i32 17, i32 17>
+; CHECK: %{{.*}} = urem <2 x i32> %{{.*}}, <i32 19, i32 19>
+; CHECK-NOT: %{{.*}} = sdiv <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = udiv <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = srem <2 x i32> %{{.*}}, <i32 0, i32 0>
+; CHECK-NOT: %{{.*}} = urem <2 x i32> %{{.*}}, <i32 0, i32 0>
for.body: ; preds = %if.end, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %if.end ]
; UNROLL: vector.body:
; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE4]] ]
-; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_STORE_CONTINUE4]] ]
+; UNROLL-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_STORE_CONTINUE4]] ]
; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]]
; UNROLL-NEXT: br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE4]]
; UNROLL: pred.store.if:
; UNROLL-NEXT: [[TMP5:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR:%.*]], i64 0, i64 [[INDUCTION]]
; UNROLL-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
; UNROLL-NEXT: store i32 [[TMP6]], i32* [[TMP5]], align 4
-; UNROLL-NEXT: [[INDUCTION2:%.*]] = add i64 [[OFFSET_IDX]], 1
-; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDUCTION2]]
+; UNROLL-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 1
+; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDUCTION1]]
; UNROLL-NEXT: [[TMP8:%.*]] = load i32, i32* [[TMP7]], align 4
; UNROLL-NEXT: store i32 [[TMP8]], i32* [[TMP7]], align 4
; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE4]]
; UNROLL: pred.store.continue4:
; UNROLL-NEXT: [[TMP9:%.*]] = add i32 [[VEC_PHI]], 1
-; UNROLL-NEXT: [[TMP10:%.*]] = add i32 [[VEC_PHI1]], 1
+; UNROLL-NEXT: [[TMP10:%.*]] = add i32 [[VEC_PHI2]], 1
; UNROLL-NEXT: [[TMP11:%.*]] = xor i1 [[COND_2]], true
; UNROLL-NEXT: [[TMP12:%.*]] = xor i1 [[COND_2]], true
; UNROLL-NEXT: [[PREDPHI]] = select i1 [[TMP11]], i32 [[VEC_PHI]], i32 [[TMP9]]
-; UNROLL-NEXT: [[PREDPHI5]] = select i1 [[TMP12]], i32 [[VEC_PHI1]], i32 [[TMP10]]
+; UNROLL-NEXT: [[PREDPHI5]] = select i1 [[TMP12]], i32 [[VEC_PHI2]], i32 [[TMP10]]
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; UNROLL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; UNROLL-NOSIMPLIFY: vector.body:
; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
; UNROLL-NOSIMPLIFY-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[V_2:%.*]], [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_STORE_CONTINUE4]] ]
-; UNROLL-NOSIMPLIFY-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_STORE_CONTINUE4]] ]
+; UNROLL-NOSIMPLIFY-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_STORE_CONTINUE4]] ]
; UNROLL-NOSIMPLIFY-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[V_1]], [[INDEX]]
; UNROLL-NOSIMPLIFY-NEXT: br i1 [[COND_2:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; UNROLL-NOSIMPLIFY: pred.store.if:
; UNROLL-NOSIMPLIFY: pred.store.continue:
; UNROLL-NOSIMPLIFY-NEXT: br i1 [[COND_2]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
; UNROLL-NOSIMPLIFY: pred.store.if3:
-; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION2:%.*]] = add i64 [[OFFSET_IDX]], 1
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDUCTION2]]
+; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 1
+; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = getelementptr inbounds [768 x i32], [768 x i32]* [[PTR]], i64 0, i64 [[INDUCTION1]]
; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
; UNROLL-NOSIMPLIFY-NEXT: store i32 [[TMP7]], i32* [[TMP6]], align 4
; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE4]]
; UNROLL-NOSIMPLIFY: pred.store.continue4:
; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = add i32 [[VEC_PHI]], 1
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = add i32 [[VEC_PHI1]], 1
+; UNROLL-NOSIMPLIFY-NEXT: [[TMP9:%.*]] = add i32 [[VEC_PHI2]], 1
; UNROLL-NOSIMPLIFY-NEXT: [[TMP10:%.*]] = xor i1 [[COND_2]], true
; UNROLL-NOSIMPLIFY-NEXT: [[TMP11:%.*]] = xor i1 [[COND_2]], true
; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI]] = select i1 [[TMP10]], i32 [[VEC_PHI]], i32 [[TMP8]]
-; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI5]] = select i1 [[TMP11]], i32 [[VEC_PHI1]], i32 [[TMP9]]
+; UNROLL-NOSIMPLIFY-NEXT: [[PREDPHI5]] = select i1 [[TMP11]], i32 [[VEC_PHI2]], i32 [[TMP9]]
; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; UNROLL-NOSIMPLIFY-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NOSIMPLIFY-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; UNROLL-NEXT: entry:
; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL: vector.body:
-; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
-; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; UNROLL-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE6]]
; UNROLL: pred.store.if:
-; UNROLL-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]]
+; UNROLL-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION3]]
; UNROLL-NEXT: [[TMP1:%.*]] = load i8, i8* [[TMP0]], align 1
; UNROLL-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
; UNROLL-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; UNROLL-NEXT: store i8 [[TMP3]], i8* [[TMP0]], align 1
-; UNROLL-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
-; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]]
+; UNROLL-NEXT: [[INDUCTION4:%.*]] = add i64 [[INDEX]], 1
+; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION4]]
; UNROLL-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP4]], align 1
; UNROLL-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
; UNROLL-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
; UNROLL-NEXT: store i8 [[TMP7]], i8* [[TMP4]], align 1
-; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NEXT: br label [[PRED_STORE_CONTINUE6]]
-; UNROLL: pred.store.continue4:
+; UNROLL: pred.store.continue6:
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; UNROLL-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
; UNROLL-NOSIMPLIFY: vector.ph:
; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NOSIMPLIFY: vector.body:
-; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
+; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C:%.*]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; UNROLL-NOSIMPLIFY: pred.store.if:
-; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION]]
+; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 0
+; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION3]]
; UNROLL-NOSIMPLIFY-NEXT: [[TMP1:%.*]] = load i8, i8* [[TMP0]], align 1
; UNROLL-NOSIMPLIFY-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32
; UNROLL-NOSIMPLIFY-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8
; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP3]], i8* [[TMP0]], align 1
; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE]]
; UNROLL-NOSIMPLIFY: pred.store.continue:
-; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
-; UNROLL-NOSIMPLIFY: pred.store.if3:
+; UNROLL-NOSIMPLIFY: pred.store.if5:
; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
; UNROLL-NOSIMPLIFY-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* undef, i64 [[INDUCTION2]]
; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP7]], i8* [[TMP4]], align 1
-; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE6]]
-; UNROLL-NOSIMPLIFY: pred.store.continue4:
+; UNROLL-NOSIMPLIFY: pred.store.continue6:
; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
; VEC: vector.body:
; VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE3:%.*]] ]
; VEC-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; VEC-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]]
-; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP1]], i32 0
-; VEC-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <2 x i8>*
-; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP3]], align 1
-; VEC-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
-; VEC-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]]
+; VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i32 0
+; VEC-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <2 x i8>*
+; VEC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8>, <2 x i8>* [[TMP4]], align 1
+; VEC-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
+; VEC-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; VEC: pred.store.if:
-; VEC-NEXT: [[TMP5:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0
-; VEC-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
-; VEC-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
-; VEC-NEXT: [[TMP8:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]]
-; VEC-NEXT: store i8 [[TMP7]], i8* [[TMP8]], align 1
+; VEC-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 0
+; VEC-NEXT: [[TMP7:%.*]] = zext i8 [[TMP6]] to i32
+; VEC-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i8
+; VEC-NEXT: [[TMP9:%.*]] = getelementptr i8, i8* undef, i64 [[TMP0]]
+; VEC-NEXT: store i8 [[TMP8]], i8* [[TMP9]], align 1
; VEC-NEXT: br label [[PRED_STORE_CONTINUE]]
; VEC: pred.store.continue:
-; VEC-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
-; VEC-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
+; VEC-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
+; VEC-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
; VEC: pred.store.if2:
-; VEC-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1
-; VEC-NEXT: [[TMP11:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1
-; VEC-NEXT: [[TMP12:%.*]] = zext i8 [[TMP11]] to i32
-; VEC-NEXT: [[TMP13:%.*]] = trunc i32 [[TMP12]] to i8
-; VEC-NEXT: [[TMP14:%.*]] = getelementptr i8, i8* undef, i64 [[TMP10]]
-; VEC-NEXT: store i8 [[TMP13]], i8* [[TMP14]], align 1
+; VEC-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1
+; VEC-NEXT: [[TMP12:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1
+; VEC-NEXT: [[TMP13:%.*]] = zext i8 [[TMP12]] to i32
+; VEC-NEXT: [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
+; VEC-NEXT: [[TMP15:%.*]] = getelementptr i8, i8* undef, i64 [[TMP11]]
+; VEC-NEXT: store i8 [[TMP14]], i8* [[TMP15]], align 1
; VEC-NEXT: br label [[PRED_STORE_CONTINUE3]]
; VEC: pred.store.continue3:
; VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VEC-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; VEC-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; VEC-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; VEC-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; VEC: middle.block:
; VEC-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef
; VEC-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
; UNROLL-NOSIMPLIFY: vector.ph:
; UNROLL-NOSIMPLIFY-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NOSIMPLIFY: vector.body:
-; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
+; UNROLL-NOSIMPLIFY-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
; UNROLL-NOSIMPLIFY-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
; UNROLL-NOSIMPLIFY-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i64 [[INDUCTION]]
; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP4]], i8* [[TMP0]], align 1
; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE]]
; UNROLL-NOSIMPLIFY: pred.store.continue:
-; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NOSIMPLIFY-NEXT: br i1 [[C]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
-; UNROLL-NOSIMPLIFY: pred.store.if3:
+; UNROLL-NOSIMPLIFY: pred.store.if5:
; UNROLL-NOSIMPLIFY-NEXT: [[TMP5:%.*]] = load i8, i8* [[TMP1]], align 1
; UNROLL-NOSIMPLIFY-NEXT: [[TMP6:%.*]] = zext i8 [[TMP5]] to i32
; UNROLL-NOSIMPLIFY-NEXT: [[TMP7:%.*]] = trunc i32 [[TMP6]] to i8
; UNROLL-NOSIMPLIFY-NEXT: store i8 [[TMP7]], i8* [[TMP1]], align 1
-; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; UNROLL-NOSIMPLIFY-NEXT: br label [[PRED_STORE_CONTINUE6]]
-; UNROLL-NOSIMPLIFY: pred.store.continue4:
+; UNROLL-NOSIMPLIFY: pred.store.continue6:
; UNROLL-NOSIMPLIFY-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; UNROLL-NOSIMPLIFY-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
define void @int_iv_based_on_pointer_iv(i8* %A) {
; VF1-LABEL: @int_iv_based_on_pointer_iv(
-; VF1-NEXT: entry:
-; VF1-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 add (i64 ptrtoint (i32* @f to i64), i64 -4), i64 0)
-; VF1-NEXT: [[TMP0:%.*]] = sub i64 add (i64 ptrtoint (i32* @f to i64), i64 -1), [[SMIN]]
-; VF1-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
-; VF1-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; VF1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2
-; VF1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; VF1: vector.ph:
-; VF1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2
-; VF1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; VF1-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
-; VF1-NEXT: [[IND_END2:%.*]] = getelementptr i32, i32* null, i64 [[N_VEC]]
-; VF1-NEXT: br label [[VECTOR_BODY:%.*]]
; VF1: vector.body:
-; VF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
; VF1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
; VF1-NEXT: [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
; VF1-NEXT: [[INDUCTION3:%.*]] = add i64 [[OFFSET_IDX]], 4
-; VF1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDUCTION]]
-; VF1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDUCTION3]]
-; VF1-NEXT: store i8 0, i8* [[TMP3]], align 1
-; VF1-NEXT: store i8 0, i8* [[TMP4]], align 1
+; VF1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDUCTION]]
+; VF1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[INDUCTION3]]
+; VF1-NEXT: store i8 0, i8* [[TMP7]], align 1
+; VF1-NEXT: store i8 0, i8* [[TMP8]], align 1
; VF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; VF1: middle.block:
-; VF1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; VF1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; VF1: scalar.ph:
-; VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF1-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ null, [[ENTRY]] ]
-; VF1-NEXT: br label [[LOOP:%.*]]
-; VF1: loop:
-; VF1-NEXT: [[IV_INT:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INT_NEXT:%.*]], [[LOOP]] ]
-; VF1-NEXT: [[IV_PTR:%.*]] = phi i32* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ]
-; VF1-NEXT: [[IV_PTR_NEXT]] = getelementptr inbounds i32, i32* [[IV_PTR]], i64 1
-; VF1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IV_INT]]
-; VF1-NEXT: store i8 0, i8* [[GEP_A]], align 1
-; VF1-NEXT: [[IV_INT_NEXT]] = ptrtoint i32* [[IV_PTR_NEXT]] to i64
-; VF1-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 ptrtoint (i32* @f to i64), [[IV_INT_NEXT]]
-; VF1-NEXT: [[CMP:%.*]] = icmp sgt i64 [[SUB_PTR_SUB]], 0
-; VF1-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; VF1: exit:
-; VF1-NEXT: ret void
+; VF1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]],
+; VF1-NEXT: br i1 [[TMP13]], label %middle.block, label %vector.body
;
; VF2-LABEL: @int_iv_based_on_pointer_iv(
-; VF2-NEXT: entry:
-; VF2-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 add (i64 ptrtoint (i32* @f to i64), i64 -4), i64 0)
-; VF2-NEXT: [[TMP0:%.*]] = sub i64 add (i64 ptrtoint (i32* @f to i64), i64 -1), [[SMIN]]
-; VF2-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
-; VF2-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; VF2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 2
-; VF2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; VF2: vector.ph:
-; VF2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 2
-; VF2-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; VF2-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], 4
-; VF2-NEXT: [[IND_END2:%.*]] = getelementptr i32, i32* null, i64 [[N_VEC]]
-; VF2-NEXT: br label [[VECTOR_BODY:%.*]]
; VF2: vector.body:
-; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; VF2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
; VF2-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4
; VF2-NEXT: [[TMP3:%.*]] = add i64 [[OFFSET_IDX]], 0
; VF2-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 4
-; VF2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP3]]
-; VF2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP4]]
-; VF2-NEXT: store i8 0, i8* [[TMP5]], align 1
-; VF2-NEXT: store i8 0, i8* [[TMP6]], align 1
+; VF2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP3]]
+; VF2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP4]]
+; VF2-NEXT: store i8 0, i8* [[TMP9]], align 1
+; VF2-NEXT: store i8 0, i8* [[TMP10]], align 1
; VF2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; VF2-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF2-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; VF2: middle.block:
-; VF2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; VF2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; VF2: scalar.ph:
-; VF2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; VF2-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ null, [[ENTRY]] ]
-; VF2-NEXT: br label [[LOOP:%.*]]
-; VF2: loop:
-; VF2-NEXT: [[IV_INT:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INT_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT: [[IV_PTR:%.*]] = phi i32* [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_PTR_NEXT:%.*]], [[LOOP]] ]
-; VF2-NEXT: [[IV_PTR_NEXT]] = getelementptr inbounds i32, i32* [[IV_PTR]], i64 1
-; VF2-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IV_INT]]
-; VF2-NEXT: store i8 0, i8* [[GEP_A]], align 1
-; VF2-NEXT: [[IV_INT_NEXT]] = ptrtoint i32* [[IV_PTR_NEXT]] to i64
-; VF2-NEXT: [[SUB_PTR_SUB:%.*]] = sub i64 ptrtoint (i32* @f to i64), [[IV_INT_NEXT]]
-; VF2-NEXT: [[CMP:%.*]] = icmp sgt i64 [[SUB_PTR_SUB]], 0
-; VF2-NEXT: br i1 [[CMP]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; VF2: exit:
-; VF2-NEXT: ret void
+; VF2-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]],
+; VF2-NEXT: br i1 [[TMP14]], label %middle.block, label %vector.body
;
entry:
br label %loop
; UNROLL: vector.ph:
; UNROLL-NEXT: [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483644
; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i64 0
-; UNROLL-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
+; UNROLL-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0
; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL: vector.body:
-; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE10:%.*]] ]
-; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_UDIV_CONTINUE10]] ]
-; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_UDIV_CONTINUE10]] ]
+; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE11:%.*]] ]
+; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[PRED_UDIV_CONTINUE11]] ]
+; UNROLL-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_UDIV_CONTINUE11]] ]
; UNROLL-NEXT: [[TMP0:%.*]] = or i32 [[INDEX]], 2
; UNROLL-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64
; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 2
; UNROLL-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; UNROLL-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4
+; UNROLL-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4
; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; UNROLL: pred.udiv.if:
; UNROLL-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 0
; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE]]
; UNROLL: pred.udiv.continue:
; UNROLL-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ]
-; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]]
+; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
-; UNROLL: pred.udiv.if3:
+; UNROLL: pred.udiv.if4:
; UNROLL-NEXT: [[TMP10:%.*]] = or i32 [[INDEX]], 1
; UNROLL-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i64 1
; UNROLL-NEXT: [[TMP12:%.*]] = udiv i32 [[TMP11]], [[TMP10]]
; UNROLL-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP12]], i64 1
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE5]]
-; UNROLL: pred.udiv.continue4:
+; UNROLL: pred.udiv.continue5:
-; UNROLL-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF3]] ]
-; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
+; UNROLL-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF4]] ]
+; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
-; UNROLL: pred.udiv.if7:
+; UNROLL: pred.udiv.if8:
-; UNROLL-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i64 0
+; UNROLL-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD3]], i64 0
; UNROLL-NEXT: [[TMP16:%.*]] = udiv i32 [[TMP15]], [[TMP0]]
; UNROLL-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> poison, i32 [[TMP16]], i64 0
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE8]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE9]]
-; UNROLL: pred.udiv.continue8:
+; UNROLL: pred.udiv.continue9:
-; UNROLL-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP17]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE5]] ], [ [[TMP17]], [[PRED_UDIV_IF8]] ]
+; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11]]
-; UNROLL: pred.udiv.if9:
+; UNROLL: pred.udiv.if10:
; UNROLL-NEXT: [[TMP19:%.*]] = or i32 [[INDEX]], 3
-; UNROLL-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i64 1
+; UNROLL-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD3]], i64 1
; UNROLL-NEXT: [[TMP21:%.*]] = udiv i32 [[TMP20]], [[TMP19]]
; UNROLL-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> [[TMP18]], i32 [[TMP21]], i64 1
-; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NEXT: br label [[PRED_UDIV_CONTINUE11]]
-; UNROLL: pred.udiv.continue10:
+; UNROLL: pred.udiv.continue11:
-; UNROLL-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP22]], [[PRED_UDIV_IF9]] ]
+; UNROLL-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ [[TMP18]], [[PRED_UDIV_CONTINUE9]] ], [ [[TMP22]], [[PRED_UDIV_IF10]] ]
; UNROLL-NEXT: [[TMP24:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison>
; UNROLL-NEXT: [[TMP25:%.*]] = shufflevector <2 x i1> [[TMP24]], <2 x i1> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT: [[TMP26:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT5]], <i1 true, i1 poison>
+; UNROLL-NEXT: [[TMP26:%.*]] = xor <2 x i1> [[BROADCAST_SPLATINSERT6]], <i1 true, i1 poison>
; UNROLL-NEXT: [[TMP27:%.*]] = shufflevector <2 x i1> [[TMP26]], <2 x i1> poison, <2 x i32> zeroinitializer
; UNROLL-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP25]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP14]]
-; UNROLL-NEXT: [[PREDPHI11:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP23]]
+; UNROLL-NEXT: [[PREDPHI12:%.*]] = select <2 x i1> [[TMP27]], <2 x i32> [[WIDE_LOAD3]], <2 x i32> [[TMP23]]
; UNROLL-NEXT: [[TMP28]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
-; UNROLL-NEXT: [[TMP29]] = add <2 x i32> [[PREDPHI11]], [[VEC_PHI1]]
+; UNROLL-NEXT: [[TMP29]] = add <2 x i32> [[PREDPHI12]], [[VEC_PHI2]]
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i32 [[SMAX]], [[N_MOD_VF]]
; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C:%.*]], i32 0
; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i32 0
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT5]], <2 x i1> poison, <2 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i32 0
+; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT6]], <2 x i1> poison, <2 x i32> zeroinitializer
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
-; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE10:%.*]] ]
-; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP32:%.*]], [[PRED_UDIV_CONTINUE10]] ]
-; UNROLL-NO-IC-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[PRED_UDIV_CONTINUE10]] ]
+; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE11:%.*]] ]
+; UNROLL-NO-IC-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP32:%.*]], [[PRED_UDIV_CONTINUE11]] ]
+; UNROLL-NO-IC-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP33:%.*]], [[PRED_UDIV_CONTINUE11]] ]
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 2
; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 2
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4
+; UNROLL-NO-IC-NEXT: [[WIDE_LOAD3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0
; UNROLL-NO-IC-NEXT: br i1 [[TMP8]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; UNROLL-NO-IC: pred.udiv.if:
; UNROLL-NO-IC: pred.udiv.continue:
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_UDIV_IF]] ]
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
-; UNROLL-NO-IC-NEXT: br i1 [[TMP13]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP13]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
-; UNROLL-NO-IC: pred.udiv.if3:
+; UNROLL-NO-IC: pred.udiv.if4:
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 1
; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = udiv i32 [[TMP15]], [[TMP14]]
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP12]], i32 [[TMP16]], i32 1
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE4]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE5]]
-; UNROLL-NO-IC: pred.udiv.continue4:
+; UNROLL-NO-IC: pred.udiv.continue5:
-; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP17]], [[PRED_UDIV_IF3]] ]
-; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 0
-; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
+; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = phi <2 x i32> [ [[TMP12]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP17]], [[PRED_UDIV_IF4]] ]
+; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT7]], i32 0
+; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
-; UNROLL-NO-IC: pred.udiv.if7:
+; UNROLL-NO-IC: pred.udiv.if8:
-; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 0
+; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = extractelement <2 x i32> [[WIDE_LOAD3]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = udiv i32 [[TMP20]], [[TMP1]]
; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE8]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE9]]
-; UNROLL-NO-IC: pred.udiv.continue8:
+; UNROLL-NO-IC: pred.udiv.continue9:
-; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE4]] ], [ [[TMP22]], [[PRED_UDIV_IF7]] ]
-; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT6]], i32 1
-; UNROLL-NO-IC-NEXT: br i1 [[TMP24]], label [[PRED_UDIV_IF9:%.*]], label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = phi <2 x i32> [ poison, [[PRED_UDIV_CONTINUE5]] ], [ [[TMP22]], [[PRED_UDIV_IF8]] ]
+; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT7]], i32 1
+; UNROLL-NO-IC-NEXT: br i1 [[TMP24]], label [[PRED_UDIV_IF10:%.*]], label [[PRED_UDIV_CONTINUE11]]
-; UNROLL-NO-IC: pred.udiv.if9:
+; UNROLL-NO-IC: pred.udiv.if10:
; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = add i32 [[INDEX]], 3
-; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = extractelement <2 x i32> [[WIDE_LOAD2]], i32 1
+; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = extractelement <2 x i32> [[WIDE_LOAD3]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP27:%.*]] = udiv i32 [[TMP26]], [[TMP25]]
; UNROLL-NO-IC-NEXT: [[TMP28:%.*]] = insertelement <2 x i32> [[TMP23]], i32 [[TMP27]], i32 1
-; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE10]]
+; UNROLL-NO-IC-NEXT: br label [[PRED_UDIV_CONTINUE11]]
-; UNROLL-NO-IC: pred.udiv.continue10:
+; UNROLL-NO-IC: pred.udiv.continue11:
-; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE8]] ], [ [[TMP28]], [[PRED_UDIV_IF9]] ]
+; UNROLL-NO-IC-NEXT: [[TMP29:%.*]] = phi <2 x i32> [ [[TMP23]], [[PRED_UDIV_CONTINUE9]] ], [ [[TMP28]], [[PRED_UDIV_IF10]] ]
; UNROLL-NO-IC-NEXT: [[TMP30:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true>
-; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT6]], <i1 true, i1 true>
+; UNROLL-NO-IC-NEXT: [[TMP31:%.*]] = xor <2 x i1> [[BROADCAST_SPLAT7]], <i1 true, i1 true>
; UNROLL-NO-IC-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP30]], <2 x i32> [[WIDE_LOAD]], <2 x i32> [[TMP18]]
-; UNROLL-NO-IC-NEXT: [[PREDPHI11:%.*]] = select <2 x i1> [[TMP31]], <2 x i32> [[WIDE_LOAD2]], <2 x i32> [[TMP29]]
+; UNROLL-NO-IC-NEXT: [[PREDPHI12:%.*]] = select <2 x i1> [[TMP31]], <2 x i32> [[WIDE_LOAD3]], <2 x i32> [[TMP29]]
; UNROLL-NO-IC-NEXT: [[TMP32]] = add <2 x i32> [[PREDPHI]], [[VEC_PHI]]
-; UNROLL-NO-IC-NEXT: [[TMP33]] = add <2 x i32> [[PREDPHI11]], [[VEC_PHI1]]
+; UNROLL-NO-IC-NEXT: [[TMP33]] = add <2 x i32> [[PREDPHI12]], [[VEC_PHI2]]
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[TMP34:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; INTERLEAVE: vector.ph:
; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i32 [[SMAX]], 2147483640
; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0
-; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
+; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT10:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0
; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
; INTERLEAVE: vector.body:
-; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE18:%.*]] ]
-; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_UDIV_CONTINUE18]] ]
-; INTERLEAVE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[PRED_UDIV_CONTINUE18]] ]
+; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE19:%.*]] ]
+; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_UDIV_CONTINUE19]] ]
+; INTERLEAVE-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP49:%.*]], [[PRED_UDIV_CONTINUE19]] ]
; INTERLEAVE-NEXT: [[TMP0:%.*]] = or i32 [[INDEX]], 4
; INTERLEAVE-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64
; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 4
; INTERLEAVE-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; INTERLEAVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
+; INTERLEAVE-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]]
; INTERLEAVE: pred.udiv.if:
; INTERLEAVE-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 0
; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE]]
; INTERLEAVE: pred.udiv.continue:
; INTERLEAVE-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_UDIV_IF]] ]
-; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4:%.*]]
+; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]]
-; INTERLEAVE: pred.udiv.if3:
+; INTERLEAVE: pred.udiv.if4:
; INTERLEAVE-NEXT: [[TMP10:%.*]] = or i32 [[INDEX]], 1
; INTERLEAVE-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 1
; INTERLEAVE-NEXT: [[TMP12:%.*]] = udiv i32 [[TMP11]], [[TMP10]]
; INTERLEAVE-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP12]], i64 1
-; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE4]]
+; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE5]]
-; INTERLEAVE: pred.udiv.continue4:
+; INTERLEAVE: pred.udiv.continue5:
-; INTERLEAVE-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF3]] ]
-; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF5:%.*]], label [[PRED_UDIV_CONTINUE6:%.*]]
+; INTERLEAVE-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP13]], [[PRED_UDIV_IF4]] ]
+; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]]
-; INTERLEAVE: pred.udiv.if5:
+; INTERLEAVE: pred.udiv.if6:
; INTERLEAVE-NEXT: [[TMP15:%.*]] = or i32 [[INDEX]], 2
; INTERLEAVE-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 2
; INTERLEAVE-NEXT: [[TMP17:%.*]] = udiv i32 [[TMP16]], [[TMP15]]
; INTERLEAVE-NEXT: [[TMP18:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP17]], i64 2
-; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE6]]
+; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE7]]
-; INTERLEAVE: pred.udiv.continue6:
+; INTERLEAVE: pred.udiv.continue7:
-; INTERLEAVE-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_UDIV_CONTINUE4]] ], [ [[TMP18]], [[PRED_UDIV_IF5]] ]
-; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF7:%.*]], label [[PRED_UDIV_CONTINUE8:%.*]]
+; INTERLEAVE-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_UDIV_CONTINUE5]] ], [ [[TMP18]], [[PRED_UDIV_IF6]] ]
+; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9:%.*]]
-; INTERLEAVE: pred.udiv.if7:
+; INTERLEAVE: pred.udiv.if8:
; INTERLEAVE-NEXT: [[TMP20:%.*]] = or i32 [[INDEX]], 3
; INTERLEAVE-NEXT: [[TMP21:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i64 3
; INTERLEAVE-NEXT: [[TMP22:%.*]] = udiv i32 [[TMP21]], [[TMP20]]
; INTERLEAVE-NEXT: [[TMP23:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP22]], i64 3
-; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE8]]
+; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE9]]
-; INTERLEAVE: pred.udiv.continue8:
+; INTERLEAVE: pred.udiv.continue9:
-; INTERLEAVE-NEXT: [[TMP24:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE6]] ], [ [[TMP23]], [[PRED_UDIV_IF7]] ]
-; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF11:%.*]], label [[PRED_UDIV_CONTINUE12:%.*]]
+; INTERLEAVE-NEXT: [[TMP24:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP23]], [[PRED_UDIV_IF8]] ]
+; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF12:%.*]], label [[PRED_UDIV_CONTINUE13:%.*]]
-; INTERLEAVE: pred.udiv.if11:
+; INTERLEAVE: pred.udiv.if12:
-; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 0
+; INTERLEAVE-NEXT: [[TMP25:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i64 0
; INTERLEAVE-NEXT: [[TMP26:%.*]] = udiv i32 [[TMP25]], [[TMP0]]
; INTERLEAVE-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> poison, i32 [[TMP26]], i64 0
-; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE12]]
+; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE13]]
-; INTERLEAVE: pred.udiv.continue12:
+; INTERLEAVE: pred.udiv.continue13:
-; INTERLEAVE-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE8]] ], [ [[TMP27]], [[PRED_UDIV_IF11]] ]
-; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF13:%.*]], label [[PRED_UDIV_CONTINUE14:%.*]]
+; INTERLEAVE-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ poison, [[PRED_UDIV_CONTINUE9]] ], [ [[TMP27]], [[PRED_UDIV_IF12]] ]
+; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF14:%.*]], label [[PRED_UDIV_CONTINUE15:%.*]]
-; INTERLEAVE: pred.udiv.if13:
+; INTERLEAVE: pred.udiv.if14:
; INTERLEAVE-NEXT: [[TMP29:%.*]] = or i32 [[INDEX]], 5
-; INTERLEAVE-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 1
+; INTERLEAVE-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i64 1
; INTERLEAVE-NEXT: [[TMP31:%.*]] = udiv i32 [[TMP30]], [[TMP29]]
; INTERLEAVE-NEXT: [[TMP32:%.*]] = insertelement <4 x i32> [[TMP28]], i32 [[TMP31]], i64 1
-; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE14]]
+; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE15]]
-; INTERLEAVE: pred.udiv.continue14:
+; INTERLEAVE: pred.udiv.continue15:
-; INTERLEAVE-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE12]] ], [ [[TMP32]], [[PRED_UDIV_IF13]] ]
-; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF15:%.*]], label [[PRED_UDIV_CONTINUE16:%.*]]
+; INTERLEAVE-NEXT: [[TMP33:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_UDIV_CONTINUE13]] ], [ [[TMP32]], [[PRED_UDIV_IF14]] ]
+; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF16:%.*]], label [[PRED_UDIV_CONTINUE17:%.*]]
-; INTERLEAVE: pred.udiv.if15:
+; INTERLEAVE: pred.udiv.if16:
; INTERLEAVE-NEXT: [[TMP34:%.*]] = or i32 [[INDEX]], 6
-; INTERLEAVE-NEXT: [[TMP35:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 2
+; INTERLEAVE-NEXT: [[TMP35:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i64 2
; INTERLEAVE-NEXT: [[TMP36:%.*]] = udiv i32 [[TMP35]], [[TMP34]]
; INTERLEAVE-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP33]], i32 [[TMP36]], i64 2
-; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE16]]
+; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE17]]
-; INTERLEAVE: pred.udiv.continue16:
+; INTERLEAVE: pred.udiv.continue17:
-; INTERLEAVE-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE14]] ], [ [[TMP37]], [[PRED_UDIV_IF15]] ]
-; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF17:%.*]], label [[PRED_UDIV_CONTINUE18]]
+; INTERLEAVE-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP33]], [[PRED_UDIV_CONTINUE15]] ], [ [[TMP37]], [[PRED_UDIV_IF16]] ]
+; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF18:%.*]], label [[PRED_UDIV_CONTINUE19]]
-; INTERLEAVE: pred.udiv.if17:
+; INTERLEAVE: pred.udiv.if18:
; INTERLEAVE-NEXT: [[TMP39:%.*]] = or i32 [[INDEX]], 7
-; INTERLEAVE-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[WIDE_LOAD2]], i64 3
+; INTERLEAVE-NEXT: [[TMP40:%.*]] = extractelement <4 x i32> [[WIDE_LOAD3]], i64 3
; INTERLEAVE-NEXT: [[TMP41:%.*]] = udiv i32 [[TMP40]], [[TMP39]]
; INTERLEAVE-NEXT: [[TMP42:%.*]] = insertelement <4 x i32> [[TMP38]], i32 [[TMP41]], i64 3
-; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE18]]
+; INTERLEAVE-NEXT: br label [[PRED_UDIV_CONTINUE19]]
-; INTERLEAVE: pred.udiv.continue18:
+; INTERLEAVE: pred.udiv.continue19:
-; INTERLEAVE-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE16]] ], [ [[TMP42]], [[PRED_UDIV_IF17]] ]
+; INTERLEAVE-NEXT: [[TMP43:%.*]] = phi <4 x i32> [ [[TMP38]], [[PRED_UDIV_CONTINUE17]] ], [ [[TMP42]], [[PRED_UDIV_IF18]] ]
; INTERLEAVE-NEXT: [[TMP44:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT]], <i1 true, i1 poison, i1 poison, i1 poison>
; INTERLEAVE-NEXT: [[TMP45:%.*]] = shufflevector <4 x i1> [[TMP44]], <4 x i1> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT9]], <i1 true, i1 poison, i1 poison, i1 poison>
+; INTERLEAVE-NEXT: [[TMP46:%.*]] = xor <4 x i1> [[BROADCAST_SPLATINSERT10]], <i1 true, i1 poison, i1 poison, i1 poison>
; INTERLEAVE-NEXT: [[TMP47:%.*]] = shufflevector <4 x i1> [[TMP46]], <4 x i1> poison, <4 x i32> zeroinitializer
; INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP45]], <4 x i32> [[WIDE_LOAD]], <4 x i32> [[TMP24]]
-; INTERLEAVE-NEXT: [[PREDPHI19:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> [[TMP43]]
+; INTERLEAVE-NEXT: [[PREDPHI20:%.*]] = select <4 x i1> [[TMP47]], <4 x i32> [[WIDE_LOAD3]], <4 x i32> [[TMP43]]
; INTERLEAVE-NEXT: [[TMP48]] = add <4 x i32> [[PREDPHI]], [[VEC_PHI]]
-; INTERLEAVE-NEXT: [[TMP49]] = add <4 x i32> [[PREDPHI19]], [[VEC_PHI1]]
+; INTERLEAVE-NEXT: [[TMP49]] = add <4 x i32> [[PREDPHI20]], [[VEC_PHI2]]
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
; INTERLEAVE-NEXT: [[TMP50:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; INTERLEAVE-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND1]]
; CHECK-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16>
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[TMP3]], i32 1
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP4]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1
; CHECK-NEXT: store i16 [[TMP10]], i16* [[TMP8]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <2 x i32> [[VEC_IND1]], <i32 2, i32 2>
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; CHECK: middle.block:
; IND-NEXT: br label [[VECTOR_BODY:%.*]]
; IND: vector.body:
; IND-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IND-NEXT: [[VEC_IND1:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
; IND-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1
-; IND-NEXT: [[TMP4:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
+; IND-NEXT: [[TMP4:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND1]]
; IND-NEXT: [[TMP5:%.*]] = trunc <2 x i32> [[TMP4]] to <2 x i16>
; IND-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[INDEX]], i32 1
; IND-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I16]], %pair.i16* [[P]], i64 [[TMP3]], i32 1
; IND-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP5]], i64 1
; IND-NEXT: store i16 [[TMP9]], i16* [[TMP7]], align 2
; IND-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; IND-NEXT: [[VEC_IND_NEXT2]] = add <2 x i32> [[VEC_IND1]], <i32 2, i32 2>
; IND-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; IND-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; IND: middle.block:
; UNROLL-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
; UNROLL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i64 0
; UNROLL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
+; UNROLL-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i64 0
; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL: vector.body:
; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NEXT: [[VEC_IND2:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1
; UNROLL-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 2
; UNROLL-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 3
-; UNROLL-NEXT: [[TMP6:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; UNROLL-NEXT: [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLATINSERT2]], <i32 2, i32 poison>
+; UNROLL-NEXT: [[TMP6:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND2]]
+; UNROLL-NEXT: [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLATINSERT6]], <i32 2, i32 poison>
; UNROLL-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[VEC_IND]]
+; UNROLL-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], [[VEC_IND2]]
; UNROLL-NEXT: [[TMP10:%.*]] = trunc <2 x i32> [[TMP6]] to <2 x i16>
; UNROLL-NEXT: [[TMP11:%.*]] = trunc <2 x i32> [[TMP9]] to <2 x i16>
; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[INDEX]], i32 1
; UNROLL-NEXT: [[TMP19:%.*]] = extractelement <2 x i16> [[TMP11]], i64 1
; UNROLL-NEXT: store i16 [[TMP19]], i16* [[TMP15]], align 2
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
+; UNROLL-NEXT: [[VEC_IND_NEXT5]] = add <2 x i32> [[VEC_IND2]], <i32 4, i32 4>
; UNROLL-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; UNROLL: middle.block:
; UNROLL-NO-IC-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[A:%.*]], i32 0
; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT2]], <2 x i32> poison, <2 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <2 x i32> poison, i32 [[A]], i32 0
+; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT6]], <2 x i32> poison, <2 x i32> zeroinitializer
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; UNROLL-NO-IC-NEXT: [[VEC_IND2:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT: [[STEP_ADD3:%.*]] = add <2 x i32> [[VEC_IND2]], <i32 2, i32 2>
; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 2
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 3
-; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add <2 x i32> [[BROADCAST_SPLAT3]], [[STEP_ADD]]
+; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND2]]
+; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add <2 x i32> [[BROADCAST_SPLAT7]], [[STEP_ADD3]]
; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = trunc <2 x i32> [[TMP7]] to <2 x i16>
; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = trunc <2 x i32> [[TMP8]] to <2 x i16>
; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[TMP3]], i32 1
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[TMP10]], i32 1
; UNROLL-NO-IC-NEXT: store i16 [[TMP18]], i16* [[TMP14]], align 2
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT5]] = add <2 x i32> [[STEP_ADD3]], <i32 2, i32 2>
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; INTERLEAVE-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934584
; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0
; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
+; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i64 0
; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
; INTERLEAVE: vector.body:
; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[VEC_IND2:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT5:%.*]], [[VECTOR_BODY]] ]
; INTERLEAVE-NEXT: [[TMP3:%.*]] = or i64 [[INDEX]], 1
; INTERLEAVE-NEXT: [[TMP4:%.*]] = or i64 [[INDEX]], 2
; INTERLEAVE-NEXT: [[TMP5:%.*]] = or i64 [[INDEX]], 3
; INTERLEAVE-NEXT: [[TMP7:%.*]] = or i64 [[INDEX]], 5
; INTERLEAVE-NEXT: [[TMP8:%.*]] = or i64 [[INDEX]], 6
; INTERLEAVE-NEXT: [[TMP9:%.*]] = or i64 [[INDEX]], 7
-; INTERLEAVE-NEXT: [[TMP10:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; INTERLEAVE-NEXT: [[TMP11:%.*]] = add <4 x i32> [[BROADCAST_SPLATINSERT2]], <i32 4, i32 poison, i32 poison, i32 poison>
+; INTERLEAVE-NEXT: [[TMP10:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], [[VEC_IND2]]
+; INTERLEAVE-NEXT: [[TMP11:%.*]] = add <4 x i32> [[BROADCAST_SPLATINSERT6]], <i32 4, i32 poison, i32 poison, i32 poison>
; INTERLEAVE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[VEC_IND]]
+; INTERLEAVE-NEXT: [[TMP13:%.*]] = add <4 x i32> [[TMP12]], [[VEC_IND2]]
; INTERLEAVE-NEXT: [[TMP14:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i16>
; INTERLEAVE-NEXT: [[TMP15:%.*]] = trunc <4 x i32> [[TMP13]] to <4 x i16>
; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[PAIR_I16:%.*]], %pair.i16* [[P:%.*]], i64 [[INDEX]], i32 1
; INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement <4 x i16> [[TMP15]], i64 3
; INTERLEAVE-NEXT: store i16 [[TMP31]], i16* [[TMP23]], align 2
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
+; INTERLEAVE-NEXT: [[VEC_IND_NEXT5]] = add <4 x i32> [[VEC_IND2]], <i32 8, i32 8, i32 8, i32 8>
; INTERLEAVE-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; INTERLEAVE-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
; INTERLEAVE: middle.block:
; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8
; CHECK-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[CAST_CRD]]
; CHECK-NEXT: [[IND_END3:%.*]] = add i32 [[EXT]], [[N_VEC]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 0
+; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION6:%.*]] = add <2 x i32> [[DOTSPLAT5]], <i32 0, i32 1>
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <2 x i32> [ [[INDUCTION6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP12:%.*]] = trunc i32 [[INDEX]] to i8
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP12]]
; CHECK-NEXT: [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i8 [[TMP13]]
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP16]], align 4
+; CHECK-NEXT: store <2 x i32> [[VEC_IND7]], <2 x i32>* [[TMP16]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; CHECK-NEXT: [[VEC_IND_NEXT8]] = add <2 x i32> [[VEC_IND7]], <i32 2, i32 2>
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
; CHECK: middle.block:
; IND-NEXT: [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8
; IND-NEXT: [[IND_END:%.*]] = add i8 [[CAST_CRD]], [[T]]
; IND-NEXT: [[IND_END3:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0
-; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; IND-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0
+; IND-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; IND-NEXT: [[INDUCTION6:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT5]], <i32 0, i32 1>
; IND-NEXT: br label [[VECTOR_BODY:%.*]]
; IND: vector.body:
; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IND-NEXT: [[VEC_IND7:%.*]] = phi <2 x i32> [ [[INDUCTION6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT8:%.*]], [[VECTOR_BODY]] ]
; IND-NEXT: [[TMP12:%.*]] = trunc i32 [[INDEX]] to i8
; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[TMP12]], [[T]]
; IND-NEXT: [[TMP13:%.*]] = sext i8 [[OFFSET_IDX]] to i64
; IND-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP13]]
; IND-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; IND-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP15]], align 4
+; IND-NEXT: store <2 x i32> [[VEC_IND7]], <2 x i32>* [[TMP15]], align 4
; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; IND-NEXT: [[VEC_IND_NEXT8]] = add <2 x i32> [[VEC_IND7]], <i32 2, i32 2>
; IND-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; IND-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
; IND: middle.block:
; UNROLL-NEXT: [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8
; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[CAST_CRD]], [[T]]
; UNROLL-NEXT: [[IND_END3:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0
-; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; UNROLL-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i64 0
+; UNROLL-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT5]], <2 x i32> poison, <2 x i32> zeroinitializer
+; UNROLL-NEXT: [[INDUCTION7:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT6]], <i32 0, i32 1>
; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL: vector.body:
; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; UNROLL-NEXT: [[VEC_IND8:%.*]] = phi <2 x i32> [ [[INDUCTION7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NEXT: [[STEP_ADD9:%.*]] = add <2 x i32> [[VEC_IND8]], <i32 2, i32 2>
; UNROLL-NEXT: [[TMP12:%.*]] = trunc i32 [[INDEX]] to i8
; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[TMP12]], [[T]]
; UNROLL-NEXT: [[TMP13:%.*]] = sext i8 [[OFFSET_IDX]] to i64
; UNROLL-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP13]]
; UNROLL-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
-; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP15]], align 4
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND8]], <2 x i32>* [[TMP15]], align 4
; UNROLL-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i64 2
; UNROLL-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP17]], align 4
+; UNROLL-NEXT: store <2 x i32> [[STEP_ADD9]], <2 x i32>* [[TMP17]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
+; UNROLL-NEXT: [[VEC_IND_NEXT11]] = add <2 x i32> [[VEC_IND8]], <i32 4, i32 4>
; UNROLL-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
; UNROLL: middle.block:
; UNROLL-NO-IC-NEXT: [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8
; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[CAST_CRD]]
; UNROLL-NO-IC-NEXT: [[IND_END3:%.*]] = add i32 [[EXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 0
-; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 1>
+; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <2 x i32> poison, i32 [[EXT]], i32 0
+; UNROLL-NO-IC-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT5]], <2 x i32> poison, <2 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT: [[INDUCTION7:%.*]] = add <2 x i32> [[DOTSPLAT6]], <i32 0, i32 1>
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; UNROLL-NO-IC-NEXT: [[VEC_IND8:%.*]] = phi <2 x i32> [ [[INDUCTION7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT: [[STEP_ADD9:%.*]] = add <2 x i32> [[VEC_IND8]], <i32 2, i32 2>
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = trunc i32 [[INDEX]] to i8
; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP12]]
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = add i8 [[OFFSET_IDX]], 0
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[TMP14]]
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP18]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND8]], <2 x i32>* [[TMP18]], align 4
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 2
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP20]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD9]], <2 x i32>* [[TMP20]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT11]] = add <2 x i32> [[STEP_ADD9]], <i32 2, i32 2>
; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; INTERLEAVE-NEXT: [[CAST_CRD:%.*]] = trunc i32 [[N_VEC]] to i8
; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[CAST_CRD]], [[T]]
; INTERLEAVE-NEXT: [[IND_END3:%.*]] = add i32 [[N_VEC]], [[EXT]]
-; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT]], i64 0
-; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 1, i32 2, i32 3>
+; INTERLEAVE-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[EXT]], i64 0
+; INTERLEAVE-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer
+; INTERLEAVE-NEXT: [[INDUCTION7:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT6]], <i32 0, i32 1, i32 2, i32 3>
; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
; INTERLEAVE: vector.body:
; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; INTERLEAVE-NEXT: [[VEC_IND8:%.*]] = phi <4 x i32> [ [[INDUCTION7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[STEP_ADD9:%.*]] = add <4 x i32> [[VEC_IND8]], <i32 4, i32 4, i32 4, i32 4>
; INTERLEAVE-NEXT: [[TMP12:%.*]] = trunc i32 [[INDEX]] to i8
; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[TMP12]], [[T]]
; INTERLEAVE-NEXT: [[TMP13:%.*]] = sext i8 [[OFFSET_IDX]] to i64
; INTERLEAVE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP13]]
; INTERLEAVE-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP15]], align 4
+; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND8]], <4 x i32>* [[TMP15]], align 4
; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i64 4
; INTERLEAVE-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <4 x i32>*
-; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP17]], align 4
+; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD9]], <4 x i32>* [[TMP17]], align 4
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
+; INTERLEAVE-NEXT: [[VEC_IND_NEXT11]] = add <4 x i32> [[VEC_IND8]], <i32 8, i32 8, i32 8, i32 8>
; INTERLEAVE-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; INTERLEAVE-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP36:![0-9]+]]
; INTERLEAVE: middle.block:
; CHECK-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[CAST_CRD]]
; CHECK-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
; CHECK-NEXT: [[IND_END2:%.*]] = add i32 [[EXT_MUL]], [[TMP12]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
+; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 0
+; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[INDUCTION6:%.*]] = add <2 x i32> [[DOTSPLAT5]], <i32 0, i32 4>
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND7:%.*]] = phi <2 x i32> [ [[INDUCTION6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT8:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP13:%.*]] = trunc i32 [[INDEX]] to i8
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP13]]
; CHECK-NEXT: [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i8 [[TMP14]]
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP17]], align 4
+; CHECK-NEXT: store <2 x i32> [[VEC_IND7]], <2 x i32>* [[TMP17]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
+; CHECK-NEXT: [[VEC_IND_NEXT8]] = add <2 x i32> [[VEC_IND7]], <i32 8, i32 8>
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
; CHECK: middle.block:
; IND-NEXT: [[IND_END:%.*]] = add i8 [[CAST_CRD]], [[T]]
; IND-NEXT: [[TMP12:%.*]] = add i32 [[N_VEC]], [[EXT]]
; IND-NEXT: [[IND_END2:%.*]] = shl i32 [[TMP12]], 2
-; IND-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
-; IND-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; IND-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
+; IND-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
+; IND-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT4]], <2 x i32> poison, <2 x i32> zeroinitializer
+; IND-NEXT: [[INDUCTION6:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT5]], <i32 0, i32 4>
; IND-NEXT: br label [[VECTOR_BODY:%.*]]
; IND: vector.body:
; IND-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; IND-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; IND-NEXT: [[VEC_IND7:%.*]] = phi <2 x i32> [ [[INDUCTION6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT8:%.*]], [[VECTOR_BODY]] ]
; IND-NEXT: [[TMP13:%.*]] = trunc i32 [[INDEX]] to i8
; IND-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[TMP13]], [[T]]
; IND-NEXT: [[TMP14:%.*]] = sext i8 [[OFFSET_IDX]] to i64
; IND-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP14]]
; IND-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; IND-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP16]], align 4
+; IND-NEXT: store <2 x i32> [[VEC_IND7]], <2 x i32>* [[TMP16]], align 4
; IND-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; IND-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
+; IND-NEXT: [[VEC_IND_NEXT8]] = add <2 x i32> [[VEC_IND7]], <i32 8, i32 8>
; IND-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; IND-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
; IND: middle.block:
; UNROLL-NEXT: [[IND_END:%.*]] = add i8 [[CAST_CRD]], [[T]]
; UNROLL-NEXT: [[TMP12:%.*]] = add i32 [[N_VEC]], [[EXT]]
; UNROLL-NEXT: [[IND_END2:%.*]] = shl i32 [[TMP12]], 2
-; UNROLL-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
-; UNROLL-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NEXT: [[INDUCTION:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
+; UNROLL-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i64 0
+; UNROLL-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT5]], <2 x i32> poison, <2 x i32> zeroinitializer
+; UNROLL-NEXT: [[INDUCTION7:%.*]] = add nuw nsw <2 x i32> [[DOTSPLAT6]], <i32 0, i32 4>
; UNROLL-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL: vector.body:
; UNROLL-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
+; UNROLL-NEXT: [[VEC_IND8:%.*]] = phi <2 x i32> [ [[INDUCTION7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NEXT: [[STEP_ADD9:%.*]] = add <2 x i32> [[VEC_IND8]], <i32 8, i32 8>
; UNROLL-NEXT: [[TMP13:%.*]] = trunc i32 [[INDEX]] to i8
; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[TMP13]], [[T]]
; UNROLL-NEXT: [[TMP14:%.*]] = sext i8 [[OFFSET_IDX]] to i64
; UNROLL-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP14]]
; UNROLL-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP16]], align 4
+; UNROLL-NEXT: store <2 x i32> [[VEC_IND8]], <2 x i32>* [[TMP16]], align 4
; UNROLL-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i64 2
; UNROLL-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <2 x i32>*
-; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP18]], align 4
+; UNROLL-NEXT: store <2 x i32> [[STEP_ADD9]], <2 x i32>* [[TMP18]], align 4
; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 16, i32 16>
+; UNROLL-NEXT: [[VEC_IND_NEXT11]] = add <2 x i32> [[VEC_IND8]], <i32 16, i32 16>
; UNROLL-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
; UNROLL: middle.block:
; UNROLL-NO-IC-NEXT: [[IND_END:%.*]] = add i8 [[T]], [[CAST_CRD]]
; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
; UNROLL-NO-IC-NEXT: [[IND_END2:%.*]] = add i32 [[EXT_MUL]], [[TMP12]]
-; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 0
-; UNROLL-NO-IC-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], <i32 0, i32 4>
+; UNROLL-NO-IC-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <2 x i32> poison, i32 [[EXT_MUL]], i32 0
+; UNROLL-NO-IC-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT5]], <2 x i32> poison, <2 x i32> zeroinitializer
+; UNROLL-NO-IC-NEXT: [[INDUCTION7:%.*]] = add <2 x i32> [[DOTSPLAT6]], <i32 0, i32 4>
; UNROLL-NO-IC-NEXT: br label [[VECTOR_BODY:%.*]]
; UNROLL-NO-IC: vector.body:
; UNROLL-NO-IC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 8, i32 8>
+; UNROLL-NO-IC-NEXT: [[VEC_IND8:%.*]] = phi <2 x i32> [ [[INDUCTION7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], [[VECTOR_BODY]] ]
+; UNROLL-NO-IC-NEXT: [[STEP_ADD9:%.*]] = add <2 x i32> [[VEC_IND8]], <i32 8, i32 8>
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = trunc i32 [[INDEX]] to i8
; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[T]], [[TMP13]]
; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = add i8 [[OFFSET_IDX]], 0
; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[A]], i8 [[TMP15]]
; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP19]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND8]], <2 x i32>* [[TMP19]], align 4
; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 2
; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP21]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD9]], <2 x i32>* [[TMP21]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 8, i32 8>
+; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT11]] = add <2 x i32> [[STEP_ADD9]], <i32 8, i32 8>
; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; INTERLEAVE-NEXT: [[IND_END:%.*]] = add i8 [[CAST_CRD]], [[T]]
; INTERLEAVE-NEXT: [[TMP12:%.*]] = add i32 [[N_VEC]], [[EXT]]
; INTERLEAVE-NEXT: [[IND_END2:%.*]] = shl i32 [[TMP12]], 2
-; INTERLEAVE-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 0
-; INTERLEAVE-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; INTERLEAVE-NEXT: [[INDUCTION:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT]], <i32 0, i32 4, i32 8, i32 12>
+; INTERLEAVE-NEXT: [[DOTSPLATINSERT5:%.*]] = insertelement <4 x i32> poison, i32 [[EXT_MUL]], i64 0
+; INTERLEAVE-NEXT: [[DOTSPLAT6:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT5]], <4 x i32> poison, <4 x i32> zeroinitializer
+; INTERLEAVE-NEXT: [[INDUCTION7:%.*]] = add nuw nsw <4 x i32> [[DOTSPLAT6]], <i32 0, i32 4, i32 8, i32 12>
; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]]
; INTERLEAVE: vector.body:
; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], <i32 16, i32 16, i32 16, i32 16>
+; INTERLEAVE-NEXT: [[VEC_IND8:%.*]] = phi <4 x i32> [ [[INDUCTION7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT11:%.*]], [[VECTOR_BODY]] ]
+; INTERLEAVE-NEXT: [[STEP_ADD9:%.*]] = add <4 x i32> [[VEC_IND8]], <i32 16, i32 16, i32 16, i32 16>
; INTERLEAVE-NEXT: [[TMP13:%.*]] = trunc i32 [[INDEX]] to i8
; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[TMP13]], [[T]]
; INTERLEAVE-NEXT: [[TMP14:%.*]] = sext i8 [[OFFSET_IDX]] to i64
; INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP14]]
; INTERLEAVE-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP16]], align 4
+; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND8]], <4 x i32>* [[TMP16]], align 4
; INTERLEAVE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i64 4
; INTERLEAVE-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
-; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], <4 x i32>* [[TMP18]], align 4
+; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD9]], <4 x i32>* [[TMP18]], align 4
; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 32, i32 32, i32 32, i32 32>
+; INTERLEAVE-NEXT: [[VEC_IND_NEXT11]] = add <4 x i32> [[VEC_IND8]], <i32 32, i32 32, i32 32, i32 32>
; INTERLEAVE-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
; INTERLEAVE-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP38:![0-9]+]]
; INTERLEAVE: middle.block:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 2
-; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
-; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP1]]
-; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP5]], align 4
-; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
+; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
+; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP2]]
+; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP7]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP7]], align 4
+; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP9]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP40:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[K]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDEX]] to i32
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP9]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP6]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP10]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = trunc i64 [[INDEX]] to i32
; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 0
-; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = add i32 [[TMP5]], 2
-; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP6]]
-; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP7]]
-; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP11]], align 4
-; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = add i32 [[TMP5]], 2
+; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP6]]
+; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP8]]
+; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP13]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP13]], align 4
+; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <2 x i32>*
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP15]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[K]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[I]], [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP5]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; UNROLL-NO-IC-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; UNROLL-NO-IC-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[I]], [[INDEX]]
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 0
-; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 2
-; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP1]]
-; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP2]]
-; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4
-; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], 2
+; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP1]]
+; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP3]]
+; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP8]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP8]], align 4
+; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], <2 x i32>* [[TMP10]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; UNROLL-NO-IC-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; UNROLL-NO-IC-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
; UNROLL-NO-IC-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[VEC_IND]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[SRC:%.*]], align 4
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP4]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[DST:%.*]], i32 [[TMP1]]
-; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP6]], <2 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[DST:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP7]], <2 x i32>* [[TMP9]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 100
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 1
; UNROLL-NO-IC-NEXT: [[STEP_ADD]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; UNROLL-NO-IC-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
; UNROLL-NO-IC-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0
-; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = add i32 [[TMP0]], 2
-; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
-; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> <i32 1, i32 2>
-; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = load i32, i32* [[SRC:%.*]], align 4
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP5]], i32 0
+; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], 2
+; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> <i32 1, i32 2>
+; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> <i32 1, i32 2>
+; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = load i32, i32* [[SRC:%.*]], align 4
+; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = load i32, i32* [[SRC]], align 4
-; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP6]], i32 0
+; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = load i32, i32* [[SRC]], align 4
+; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0
; UNROLL-NO-IC-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer
-; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP3]]
-; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP4]]
-; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[DST:%.*]], i32 [[TMP1]]
-; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[DST]], i32 [[TMP2]]
-; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP7]]
-; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = add <2 x i32> [[STEP_ADD]], [[TMP8]]
-; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
-; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP11]], <2 x i32>* [[TMP14]], align 4
-; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP9]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT]], [[TMP5]]
+; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[BROADCAST_SPLAT4]], [[TMP6]]
+; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[DST:%.*]], i32 [[TMP1]]
+; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[DST]], i32 [[TMP3]]
+; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP9]]
+; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = add <2 x i32> [[STEP_ADD]], [[TMP10]]
+; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr i32, i32* [[TMP11]], i32 0
; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP12]], <2 x i32>* [[TMP16]], align 4
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP13]], <2 x i32>* [[TMP16]], align 4
+; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr i32, i32* [[TMP11]], i32 2
+; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <2 x i32>*
+; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP14]], <2 x i32>* [[TMP18]], align 4
; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
-; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
-; UNROLL-NO-IC-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
+; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
+; UNROLL-NO-IC-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP52:![0-9]+]]
; UNROLL-NO-IC: middle.block:
; UNROLL-NO-IC-NEXT: [[CMP_N:%.*]] = icmp eq i64 100, 100
; UNROLL-NO-IC-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <2 x i32> [[STEP_ADD]], i32 1
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
define void @array_at_plus_one(i32 %n) {
; CHECK-LABEL: @array_at_plus_one(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N:%.*]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], 12
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @array, i64 0, i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP7]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_PLUS_12:%.*]] = add nsw i64 [[IV]], 12
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @array, i64 0, i64 [[IV_PLUS_12]]
-; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT: store i32 [[IV_TRUNC]], i32* [[GEP]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK: [[VEC_IV_TRUNC:%.+]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ [[VEC_IV_TRUNC_NEXT:%.+]], %vector.body ]
+; CHECK: [[T1:%.+]] = add i64 %index, 0
+; CHECK: [[T2:%.+]] = add nsw i64 [[T1]], 12
+; CHECK-NEXT: [[GEP:%.+]] = getelementptr inbounds [1024 x i32], [1024 x i32]* @array, i64 0, i64 [[T2]]
+; CHECK-NEXT: [[GEP0:%.+]] = getelementptr inbounds i32, i32* [[GEP]], i32 0
+; CHECK-NEXT: [[BC:%.+]] = bitcast i32* [[GEP0]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[VEC_IV_TRUNC]], <4 x i32>* [[BC]]
+; CHECK: [[VEC_IV_TRUNC_NEXT]] = add <4 x i32> [[VEC_IV_TRUNC]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK: ret void
;
entry:
br label %loop
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true < %s | FileCheck %s
; When merging two stores with interleaved access vectorization, make sure we
%struct.Vec2r = type { double, double }
define void @foobar(%struct.Vec4r* nocapture readonly %p, i32 %i)
-; CHECK-LABEL: @foobar(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CP:%.*]] = alloca [20 x %struct.Vec2r], align 8
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast [20 x %struct.Vec2r]* [[CP]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_VEC4R:%.*]], %struct.Vec4r* [[P:%.*]], i64 [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_VEC4R]], %struct.Vec4r* [[P]], i64 [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = load double, double* [[TMP3]], align 8, !tbaa [[TBAA3:![0-9]+]]
-; CHECK-NEXT: [[TMP6:%.*]] = load double, double* [[TMP4]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[TMP6]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x double> [[TMP8]], <double 2.000000e+00, double 2.000000e+00>
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_VEC4R]], %struct.Vec4r* [[P]], i64 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_VEC4R]], %struct.Vec4r* [[P]], i64 [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = load double, double* [[TMP10]], align 8, !tbaa [[TBAA8:![0-9]+]]
-; CHECK-NEXT: [[TMP13:%.*]] = load double, double* [[TMP11]], align 8, !tbaa [[TBAA8]]
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> poison, double [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x double> [[TMP14]], double [[TMP13]], i32 1
-; CHECK-NEXT: [[TMP16:%.*]] = fmul <2 x double> [[TMP15]], <double 3.000000e+00, double 3.000000e+00>
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* [[CP]], i64 0, i64 [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[TMP17]], i32 -1
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
-; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> [[TMP16]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x double> [[TMP20]], <4 x double> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
-; CHECK-NEXT: store <4 x double> [[INTERLEAVED_VEC]], <4 x double>* [[TMP19]], align 8, !tbaa [[TBAA9:![0-9]+]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 4, 4
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* [[CP]], i64 0, i64 0
-; CHECK-NEXT: call void @g(%struct.Vec2r* nonnull [[ARRAYDECAY]])
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[X:%.*]] = getelementptr inbounds [[STRUCT_VEC4R]], %struct.Vec4r* [[P]], i64 [[INDVARS_IV]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = load double, double* [[X]], align 8, !tbaa [[TBAA3]]
-; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP22]], 2.000000e+00
-; CHECK-NEXT: [[X4:%.*]] = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* [[CP]], i64 0, i64 [[INDVARS_IV]], i32 0
-; CHECK-NEXT: store double [[MUL]], double* [[X4]], align 8, !tbaa [[TBAA12:![0-9]+]]
-; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_VEC4R]], %struct.Vec4r* [[P]], i64 [[INDVARS_IV]], i32 1
-; CHECK-NEXT: [[TMP23:%.*]] = load double, double* [[Y]], align 8, !tbaa [[TBAA8]]
-; CHECK-NEXT: [[MUL7:%.*]] = fmul double [[TMP23]], 3.000000e+00
-; CHECK-NEXT: [[Y10:%.*]] = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* [[CP]], i64 0, i64 [[INDVARS_IV]], i32 1
-; CHECK-NEXT: store double [[MUL7]], double* [[Y10]], align 8, !tbaa [[TBAA14:![0-9]+]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-;
{
entry:
%cp = alloca [20 x %struct.Vec2r], align 8
%x4 = getelementptr inbounds [20 x %struct.Vec2r], [20 x %struct.Vec2r]* %cp, i64 0, i64 %indvars.iv, i32 0
; The new store should alias any double rather than one of the fields of Vec2r.
+; CHECK: store <4 x double> {{.*}} !tbaa ![[STORE_TBAA:[0-9]+]]
+; CHECK-DAG: ![[DOUBLE_TBAA:[0-9]+]] = !{!"double", !{{[0-9]+}}, i64 0}
+; CHECK-DAG: ![[STORE_TBAA]] = !{![[DOUBLE_TBAA]], ![[DOUBLE_TBAA]], i64 0}
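+; For reference, a minimal sketch of the metadata shape these patterns are meant
+; to match (the !N numbering is hypothetical, not taken from actual output):
+;   !6 = !{!"Simple C/C++ TBAA"}           ; TBAA root
+;   !5 = !{!"omnipotent char", !6, i64 0}  ; parent of all scalar type nodes
+;   !4 = !{!"double", !5, i64 0}           ; scalar "double" type node (DOUBLE_TBAA)
+;   !9 = !{!4, !4, i64 0}                  ; access tag on the widened store (STORE_TBAA)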
store double %mul, double* %x4, align 8, !tbaa !8
%y = getelementptr inbounds %struct.Vec4r, %struct.Vec4r* %p, i64 %indvars.iv, i32 1
%2 = load double, double* %y, align 8, !tbaa !10
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_UNMASKED
; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s -check-prefix=STRIDED_MASKED
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
-; We test here that the loop-vectorizer forms an interleave-groups from
+; We test here that the loop-vectorizer forms an interleave group from
; predicated memory accesses only if they are both in the same (predicated)
; block (first scenario below).
; If the accesses are not in the same predicated block, an interleave-group
; is not formed.
;}
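+; A minimal C-style sketch of that first scenario (identifiers are hypothetical,
+; not taken from the IR below): both strided stores sit under one predicate, so
+; an interleave group can be formed for them only when masked interleaved
+; accesses are enabled.
+;   for (i = 0; i < n; ++i)
+;     if (i > guard) { q[2*i] = v; q[2*i + 1] = w; }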
-; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided1'
+; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided1'
; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
-; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
-; STRIDED_MASKED: LV: Checking a loop in 'masked_strided1'
+; STRIDED_MASKED: LV: Checking a loop in 'masked_strided1'
; STRIDED_MASKED: LV: Analyzing interleaved accesses...
; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 %{{.*}}, i8* %{{.*}}, align 1
; STRIDED_MASKED-NEXT: LV: Inserted: store i8 %{{.*}}, i8* %{{.*}}, align 1
; }
;}
-; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided2'
+; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided2'
; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
; STRIDED_UNMASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1
; STRIDED_UNMASKED-NEXT: LV: Invalidate candidate interleaved store group due to gaps.
-; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
-; STRIDED_MASKED: LV: Checking a loop in 'masked_strided2'
+; STRIDED_MASKED: LV: Checking a loop in 'masked_strided2'
; STRIDED_MASKED: LV: Analyzing interleaved accesses...
; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1
; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1
;}
-; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided3'
+; STRIDED_UNMASKED: LV: Checking a loop in 'masked_strided3'
; STRIDED_UNMASKED: LV: Analyzing interleaved accesses...
-; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
+; STRIDED_UNMASKED-NOT: LV: Creating an interleave group
-; STRIDED_MASKED: LV: Checking a loop in 'masked_strided3'
+; STRIDED_MASKED: LV: Checking a loop in 'masked_strided3'
; STRIDED_MASKED: LV: Analyzing interleaved accesses...
; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 2, i8* %{{.*}}, align 1
; STRIDED_MASKED-NEXT: LV: Creating an interleave group with: store i8 1, i8* %{{.*}}, align 1
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], <i32 2, i32 2, i32 2, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], <i32 3, i32 3, i32 3, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC5]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC6]], <i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1024 x %struct.ST3], [1024 x %struct.ST3]* @S, i64 0, i64 [[INDEX]], i32 2
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -2
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 1023, i32 1022, i32 1021, i32 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 1023, i32 1022, i32 1021, i32 1020>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], %struct.ST2* [[A:%.*]], i64 [[OFFSET_IDX]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 -6
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP2]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC1]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND]]
+; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND1]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[REVERSE4]], [[VEC_IND1]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST2]], %struct.ST2* [[B:%.*]], i64 [[OFFSET_IDX]], i32 1
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 -7
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
-; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE3]], <4 x i32> [[REVERSE4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[REVERSE6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE5]], <4 x i32> [[REVERSE6]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 -4, i32 -4, i32 -4, i32 -4>
+; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], <i32 -4, i32 -4, i32 -4, i32 -4>
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i32>, <12 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
-; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
-; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
+; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]]
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[NEXT_GEP]], i64 2
-; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC5]], [[VEC_IND]]
+; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC6]], [[VEC_IND]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i64 -2
; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; TAILFOLD: vector.body:
; TAILFOLD-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
; TAILFOLD-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
-; TAILFOLD-NEXT: [[TMP1:%.*]] = icmp ule <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; TAILFOLD-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[VEC_IND]] to <2 x i64>
-; TAILFOLD-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; TAILFOLD-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; TAILFOLD-NEXT: [[TMP3:%.*]] = icmp ule <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; TAILFOLD-NEXT: [[TMP4:%.*]] = sext <2 x i32> [[VEC_IND]] to <2 x i64>
+; TAILFOLD-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP3]], i32 0
+; TAILFOLD-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; TAILFOLD: pred.store.if:
-; TAILFOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0
-; TAILFOLD-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; TAILFOLD-NEXT: store i16 0, i16* [[TMP5]], align 4
+; TAILFOLD-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
+; TAILFOLD-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP6]]
+; TAILFOLD-NEXT: store i16 0, i16* [[TMP7]], align 4
; TAILFOLD-NEXT: br label [[PRED_STORE_CONTINUE]]
; TAILFOLD: pred.store.continue:
-; TAILFOLD-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; TAILFOLD-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; TAILFOLD-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP3]], i32 1
+; TAILFOLD-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
; TAILFOLD: pred.store.if1:
-; TAILFOLD-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1
-; TAILFOLD-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[TMP7]]
-; TAILFOLD-NEXT: store i16 0, i16* [[TMP8]], align 4
+; TAILFOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; TAILFOLD-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[TMP9]]
+; TAILFOLD-NEXT: store i16 0, i16* [[TMP10]], align 4
; TAILFOLD-NEXT: br label [[PRED_STORE_CONTINUE2]]
; TAILFOLD: pred.store.continue2:
; TAILFOLD-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; TAILFOLD-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; TAILFOLD-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; TAILFOLD-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; TAILFOLD-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; TAILFOLD-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; TAILFOLD: middle.block:
; TAILFOLD-NEXT: br i1 true, label [[IF_END:%.*]], label [[SCALAR_PH]]
; TAILFOLD: scalar.ph:
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <2 x i16>*
+; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <2 x i16>*
+; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <2 x i16>*
+; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <2 x i16>*
+; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <2 x i16>*
+; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <2 x i16>*
+; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP8]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
-; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP8:%.*]] = add nsw <2 x i32> [[VEC_IND]], <i32 1, i32 1>
+; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[TMP3]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[P:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP7]] to <2 x i16>*
+; CHECK-NEXT: store <2 x i16> zeroinitializer, <2 x i16>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP9:%.*]] = add nsw <2 x i32> [[VEC_IND]], <i32 1, i32 1>
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4]] = add <2 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[ADDR:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: [[TMP5]] = add <2 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
+; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP4]])
+; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP5]])
; CHECK-NEXT: br label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
; CHECK: loop.header:
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV]], 200
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_LATCH]]
; CHECK: loop.latch:
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[GEP]], align 4
-; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[GEP]], align 4
+; CHECK-NEXT: [[ACCUM_NEXT]] = add i32 [[ACCUM]], [[TMP8]]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND2_NOT:%.*]] = icmp eq i64 [[IV]], 400
; CHECK-NEXT: br i1 [[EXITCOND2_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP21:![0-9]+]]
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; RUN: opt < %s -S -loop-vectorize -debug-only=loop-vectorize 2>&1 | FileCheck %s
; CHECK: LV: Can't vectorize due to memory conflicts
define void @test_loop_novect(double** %arr, i64 %n) {
-; CHECK-LABEL: @test_loop_novect(
-; CHECK-NEXT: for.body.lr.ph:
-; CHECK-NEXT: [[T:%.*]] = load double*, double** [[ARR:%.*]], align 8
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH:%.*]] ], [ [[I_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds double, double* [[T]], i64 [[I]]
-; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
-; CHECK-NEXT: [[A_NEXT:%.*]] = getelementptr inbounds double, double* [[T]], i64 [[I_NEXT]]
-; CHECK-NEXT: [[T1:%.*]] = load double, double* [[A]], align 8
-; CHECK-NEXT: [[T2:%.*]] = load double, double* [[A_NEXT]], align 8
-; CHECK-NEXT: store double [[T1]], double* [[A_NEXT]], align 8
-; CHECK-NEXT: store double [[T2]], double* [[A]], align 8
-; CHECK-NEXT: [[C:%.*]] = icmp eq i64 [[I]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[C]], label [[FINAL:%.*]], label [[FOR_BODY]]
-; CHECK: final:
-; CHECK-NEXT: ret void
-;
for.body.lr.ph:
%t = load double*, double** %arr, align 8
br label %for.body
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: store i8 7, i8* [[AJP3]], align 8
; CHECK-NEXT: [[J_NEXT]] = add nuw nsw i32 [[J]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[J_NEXT]], 15
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !2
; CHECK: for.end:
; CHECK-NEXT: ret void
;
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=2 -S %s | FileCheck %s
; Test cases to make sure LV & loop versioning can handle loops with
; multiple branches exiting the loop to a unique exit block. The loop should
; be vectorized with versioning, and noalias metadata should be added.
define void @multiple_exits_unique_exit_block(i32* %A, i32* %B, i64 %N) {
-; CHECK-LABEL: @multiple_exits_unique_exit_block(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-NEXT: [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8*
-; CHECK-NEXT: [[UMIN6:%.*]] = call i64 @llvm.umin.i64(i64 [[N:%.*]], i64 999)
-; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[UMIN6]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-LABEL: @multiple_exits_unique_exit_block
; CHECK: vector.memcheck:
-; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 999)
-; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[UMIN]], 1
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 2, i64 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP3]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[WIDE_LOAD]], <2 x i32>* [[TMP10]], align 4, !alias.scope !3, !noalias !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY:%.*]] ]
-; CHECK-NEXT: [[COND_0:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[COND_0]], label [[EXIT:%.*]], label [[FOR_BODY]]
-; CHECK: for.body:
-; CHECK-NEXT: [[A_GEP:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[A_GEP]], align 4
-; CHECK-NEXT: [[B_GEP:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[LV]], i32* [[B_GEP]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1
-; CHECK-NEXT: [[COND_1:%.*]] = icmp ult i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[COND_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK-LABEL: vector.body:
+; CHECK: %wide.load = load <2 x i32>, <2 x i32>* {{.*}}, align 4, !alias.scope
+; CHECK: store <2 x i32> %wide.load, <2 x i32>* {{.*}}, align 4, !alias.scope
+; CHECK: br
;
entry:
br label %loop.header
; Multiple branches exiting the loop to different blocks. Currently this is not supported.
define i32 @multiple_exits_multiple_exit_blocks(i32* %A, i32* %B, i64 %N) {
-; CHECK-LABEL: @multiple_exits_multiple_exit_blocks(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[B1:%.*]] = bitcast i32* [[B:%.*]] to i8*
-; CHECK-NEXT: [[A3:%.*]] = bitcast i32* [[A:%.*]] to i8*
-; CHECK-NEXT: [[UMIN6:%.*]] = call i64 @llvm.umin.i64(i64 [[N:%.*]], i64 999)
-; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[UMIN6]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[N]], i64 999)
-; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[UMIN]], 1
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 2, i64 [[N_MOD_VF]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP3]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP7]], align 4, !alias.scope !8
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[WIDE_LOAD]], <2 x i32>* [[TMP10]], align 4, !alias.scope !11, !noalias !8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY:%.*]] ]
-; CHECK-NEXT: [[COND_0:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT: br i1 [[COND_0]], label [[EXIT_0:%.*]], label [[FOR_BODY]]
-; CHECK: for.body:
-; CHECK-NEXT: [[A_GEP:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[LV:%.*]] = load i32, i32* [[A_GEP]], align 4
-; CHECK-NEXT: [[B_GEP:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IV]]
-; CHECK-NEXT: store i32 [[LV]], i32* [[B_GEP]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 1
-; CHECK-NEXT: [[COND_1:%.*]] = icmp ult i64 [[IV_NEXT]], 1000
-; CHECK-NEXT: br i1 [[COND_1]], label [[LOOP_HEADER]], label [[EXIT_1:%.*]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: exit.0:
-; CHECK-NEXT: ret i32 1
-; CHECK: exit.1:
-; CHECK-NEXT: ret i32 2
+; CHECK-LABEL: @multiple_exits_multiple_exit_blocks
+; CHECK-NEXT: entry:
+; CHECK: br label %loop.header
+; CHECK-NOT: <2 x i32>
;
entry:
br label %loop.header
; CHECK: .lr.ph:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DOTLR_PH]] ], [ 0, [[DOTLR_PH_PREHEADER]] ]
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[DOTLR_PH]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 true, label [[DOT_CRIT_EDGE_LOOPEXIT:%.*]], label [[DOTLR_PH]], !llvm.loop !0
; CHECK: ._crit_edge.loopexit:
; CHECK-NEXT: br label [[DOT_CRIT_EDGE]]
; CHECK: ._crit_edge:
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -transform-warning -S 2>&1 | FileCheck %s
; Verify that a warning is generated when vectorization/interleaving is explicitly specified and fails to occur.
; Function Attrs: nounwind ssp uwtable
define void @_Z4testPiS_i(i32* nocapture %A, i32* nocapture %B, i32 %number) #0 !dbg !4 {
-; CHECK-LABEL: @_Z4testPiS_i(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP25:%.*]] = icmp sgt i32 [[NUMBER:%.*]], 0, !dbg [[DBG8:![0-9]+]]
-; CHECK-NEXT: br i1 [[CMP25]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END15:%.*]], !dbg [[DBG8]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]], !dbg [[DBG12:![0-9]+]]
-; CHECK: for.cond5.preheader:
-; CHECK-NEXT: br i1 [[CMP25]], label [[FOR_BODY7_PREHEADER:%.*]], label [[FOR_END15]], !dbg [[DBG14:![0-9]+]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: for.body7.preheader:
-; CHECK-NEXT: br label [[FOR_BODY7:%.*]], !dbg [[DBG18:![0-9]+]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV27:%.*]] = phi i64 [ [[INDVARS_IV_NEXT28:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV27]], !dbg [[DBG12]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4, !dbg [[DBG12]], !tbaa [[TBAA20:![0-9]+]]
-; CHECK-NEXT: [[IDXPROM1:%.*]] = sext i32 [[TMP0]] to i64, !dbg [[DBG12]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM1]], !dbg [[DBG12]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !dbg [[DBG12]], !tbaa [[TBAA20]]
-; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], 1, !dbg [[DBG12]]
-; CHECK-NEXT: store i32 [[INC]], i32* [[ARRAYIDX2]], align 4, !dbg [[DBG12]], !tbaa [[TBAA20]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT28]] = add nuw nsw i64 [[INDVARS_IV27]], 1, !dbg [[DBG8]]
-; CHECK-NEXT: [[LFTR_WIDEIV29:%.*]] = trunc i64 [[INDVARS_IV_NEXT28]] to i32, !dbg [[DBG8]]
-; CHECK-NEXT: [[EXITCOND30:%.*]] = icmp eq i32 [[LFTR_WIDEIV29]], [[NUMBER]], !dbg [[DBG8]]
-; CHECK-NEXT: br i1 [[EXITCOND30]], label [[FOR_COND5_PREHEADER:%.*]], label [[FOR_BODY]], !dbg [[DBG8]], !llvm.loop [[LOOP10]]
-; CHECK: for.body7:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY7]] ], [ 0, [[FOR_BODY7_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]], !dbg [[DBG18]]
-; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX9]], align 4, !dbg [[DBG18]], !tbaa [[TBAA20]]
-; CHECK-NEXT: [[IDXPROM10:%.*]] = sext i32 [[TMP2]] to i64, !dbg [[DBG18]]
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[IDXPROM10]], !dbg [[DBG18]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX11]], align 4, !dbg [[DBG18]], !tbaa [[TBAA20]]
-; CHECK-NEXT: [[INC12:%.*]] = add nsw i32 [[TMP3]], 1, !dbg [[DBG18]]
-; CHECK-NEXT: store i32 [[INC12]], i32* [[ARRAYIDX11]], align 4, !dbg [[DBG18]], !tbaa [[TBAA20]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1, !dbg [[DBG14]]
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32, !dbg [[DBG14]]
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[NUMBER]], !dbg [[DBG14]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END15_LOOPEXIT:%.*]], label [[FOR_BODY7]], !dbg [[DBG14]], !llvm.loop [[LOOP16]]
-; CHECK: for.end15.loopexit:
-; CHECK-NEXT: br label [[FOR_END15]]
-; CHECK: for.end15:
-; CHECK-NEXT: ret void, !dbg [[DBG24:![0-9]+]]
-;
entry:
%cmp25 = icmp sgt i32 %number, 0, !dbg !10
br i1 %cmp25, label %for.body.preheader, label %for.end15, !dbg !10, !llvm.loop !12
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S < %s | FileCheck %s
@a = common global [128 x i32] zeroinitializer, align 16
;; Must not vectorize division reduction. Division is lossy.
define i32 @g() {
-; CHECK-LABEL: @g(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[R_05:%.*]] = phi i32 [ 80, [[ENTRY]] ], [ [[DIV:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [128 x i32], [128 x i32]* @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[DIV]] = sdiv i32 [[R_05]], [[TMP0]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: [[DIV_LCSSA:%.*]] = phi i32 [ [[DIV]], [[FOR_BODY]] ]
-; CHECK-NEXT: ret i32 [[DIV_LCSSA]]
-;
entry:
br label %for.body
for.body:
+ ; CHECK-LABEL: @g(
+ ; CHECK-NOT: sdiv <2 x i32>
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%r.05 = phi i32 [ 80, %entry ], [ %div, %for.body ]
%arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* @a, i64 0, i64 %indvars.iv
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -basic-aa -scoped-noalias-aa -loop-vectorize -licm -force-vector-width=2 \
; RUN: -force-vector-interleave=1 -S < %s | FileCheck %s
define void @f(i32* %a, i32* %b, i32* %c) {
-; CHECK-LABEL: @f(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A1:%.*]] = bitcast i32* [[A:%.*]] to i8*
-; CHECK-NEXT: [[C7:%.*]] = bitcast i32* [[C:%.*]] to i8*
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A]], i64 20
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr i32, i32* [[C]], i64 20
-; CHECK-NEXT: [[SCEVGEP89:%.*]] = bitcast i32* [[SCEVGEP8]] to i8*
-; CHECK-NEXT: [[BOUND010:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP89]]
-; CHECK-NEXT: [[BOUND111:%.*]] = icmp ult i8* [[C7]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT12:%.*]] = and i1 [[BOUND010]], [[BOUND111]]
-; CHECK-NEXT: br label [[OUTER:%.*]]
-; CHECK: outer:
-; CHECK-NEXT: [[I_2:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[I:%.*]], [[INNER_END:%.*]] ]
-; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[I_2]]
-; CHECK-NEXT: [[SCEVGEP34:%.*]] = bitcast i32* [[SCEVGEP3]] to i8*
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I_2]], 1
-; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[SCEVGEP56:%.*]] = bitcast i32* [[SCEVGEP5]] to i8*
-; CHECK-NEXT: [[ARRAYIDXB:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_2]]
-; CHECK-NEXT: br label [[INNER_PH:%.*]]
-; CHECK: inner.ph:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP56]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP34]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT12]]
-; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDXB]], align 4, !alias.scope !0
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP5]], align 4, !alias.scope !3, !noalias !5
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <2 x i32>, <2 x i32>* [[TMP8]], align 4, !alias.scope !7
-; CHECK-NEXT: [[TMP9:%.*]] = add nuw <2 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP10:%.*]] = add nuw <2 x i32> [[TMP9]], [[WIDE_LOAD13]]
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP4]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP10]], <2 x i32>* [[TMP11]], align 4, !alias.scope !3, !noalias !5
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[INNER_END]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[MIDDLE_BLOCK]] ], [ 0, [[INNER_PH]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[INNER:%.*]]
-; CHECK: inner:
-; CHECK-NEXT: [[J_2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[J:%.*]], [[INNER]] ]
-; CHECK-NEXT: [[ARRAYIDXA:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[J_2]]
-; CHECK-NEXT: [[LOADA:%.*]] = load i32, i32* [[ARRAYIDXA]], align 4
-; CHECK-NEXT: [[LOADB:%.*]] = load i32, i32* [[ARRAYIDXB]], align 4
-; CHECK-NEXT: [[ARRAYIDXC:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[J_2]]
-; CHECK-NEXT: [[LOADC:%.*]] = load i32, i32* [[ARRAYIDXC]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nuw i32 [[LOADA]], [[LOADB]]
-; CHECK-NEXT: [[ADD2:%.*]] = add nuw i32 [[ADD]], [[LOADC]]
-; CHECK-NEXT: store i32 [[ADD2]], i32* [[ARRAYIDXA]], align 4
-; CHECK-NEXT: [[J]] = add nuw nsw i64 [[J_2]], 1
-; CHECK-NEXT: [[COND1:%.*]] = icmp eq i64 [[J]], 20
-; CHECK-NEXT: br i1 [[COND1]], label [[INNER_END_LOOPEXIT:%.*]], label [[INNER]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: inner.end.loopexit:
-; CHECK-NEXT: br label [[INNER_END]]
-; CHECK: inner.end:
-; CHECK-NEXT: [[I]] = add nuw nsw i64 [[I_2]], 1
-; CHECK-NEXT: [[COND2:%.*]] = icmp eq i64 [[I]], 30
-; CHECK-NEXT: br i1 [[COND2]], label [[OUTER_END:%.*]], label [[OUTER]]
-; CHECK: outer.end:
-; CHECK-NEXT: ret void
-;
entry:
br label %outer
br label %inner.ph
inner.ph:
+; CHECK: vector.ph:
+; CHECK: load i32, i32* %arrayidxB,
+; CHECK: br label %vector.body
br label %inner
inner:
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s
define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[TMP4]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 4
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
-; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[WIDE_LOAD]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[WIDE_LOAD1]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP8]], <4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 4
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP9]], <4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1600, 1600
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1600, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP17:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP17]], 1.000000e+02
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP17]], 1.000000e+00
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body
+; CHECK-LABEL: @test1
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%struct.data = type { float*, float* }
define void @test2(%struct.data* nocapture readonly %d) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_DATA:%.*]], %struct.data* [[D:%.*]], i64 0, i32 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float*, float** [[B]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to i8*
-; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint float* [[TMP0]] to i64
-; CHECK-NEXT: [[MASKEDPTR:%.*]] = and i64 [[PTRINT]], 31
-; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[MASKEDPTR]], 0
-; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_DATA]], %struct.data* [[D]], i64 0, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = load float*, float** [[A]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to i8*
-; CHECK-NEXT: [[PTRINT2:%.*]] = ptrtoint float* [[TMP2]] to i64
-; CHECK-NEXT: [[MASKEDPTR3:%.*]] = and i64 [[PTRINT2]], 31
-; CHECK-NEXT: [[MASKCOND4:%.*]] = icmp eq i64 [[MASKEDPTR3]], 0
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[TMP2]], i64 1600
-; CHECK-NEXT: [[SCEVGEP1:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr float, float* [[TMP0]], i64 1600
-; CHECK-NEXT: [[SCEVGEP23:%.*]] = bitcast float* [[SCEVGEP2]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[TMP3]], [[SCEVGEP23]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[TMP1]], [[SCEVGEP1]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4, !alias.scope !7
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP6]], i32 4
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[TMP10]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>* [[TMP11]], align 4, !alias.scope !7
-; CHECK-NEXT: [[TMP12:%.*]] = fadd <4 x float> [[WIDE_LOAD]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT: [[TMP13:%.*]] = fadd <4 x float> [[WIDE_LOAD4]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]])
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 0
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP12]], <4 x float>* [[TMP17]], align 4, !alias.scope !12, !noalias !7
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 4
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP13]], <4 x float>* [[TMP19]], align 4, !alias.scope !12, !noalias !7
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1600, 1600
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1600, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[TMP0]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP21:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP21]], 1.000000e+00
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META10]])
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
entry:
%b = getelementptr inbounds %struct.data, %struct.data* %d, i64 0, i32 1
%0 = load float*, float** %b, align 8
%maskcond4 = icmp eq i64 %maskedptr3, 0
br label %for.body
+; CHECK-LABEL: @test2
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST:!.*]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST:!.*]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
}
define void @predicated_noalias_scope_decl(float* noalias nocapture readonly %a, float* noalias nocapture %b, i32 %n) {
+
; Check that the vector.body still contains an llvm.experimental.noalias.scope.decl call
+
; CHECK-LABEL: @predicated_noalias_scope_decl(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult <4 x i64> [[VEC_IND]], <i64 495616, i64 495616, i64 495616, i64 495616>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <4 x i64> [[STEP_ADD]], <i64 495616, i64 495616, i64 495616, i64 495616>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <4 x i64> [[VEC_IND]], <i64 991232, i64 991232, i64 991232, i64 991232>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <4 x i64> [[STEP_ADD]], <i64 991232, i64 991232, i64 991232, i64 991232>
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP8:%.*]] = xor <4 x i1> [[TMP4]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP3]], <4 x float> <float 2.300000e+01, float 2.300000e+01, float 2.300000e+01, float 2.300000e+01>, <4 x float> <float 4.200000e+01, float 4.200000e+01, float 4.200000e+01, float 4.200000e+01>
-; CHECK-NEXT: [[PREDPHI2:%.*]] = select <4 x i1> [[TMP4]], <4 x float> <float 2.300000e+01, float 2.300000e+01, float 2.300000e+01, float 2.300000e+01>, <4 x float> <float 4.200000e+01, float 4.200000e+01, float 4.200000e+01, float 4.200000e+01>
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP12]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP9]], i32 4
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>* [[TMP14]], align 4
-; CHECK-NEXT: [[TMP15:%.*]] = fmul <4 x float> [[PREDPHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP16:%.*]] = fmul <4 x float> [[PREDPHI2]], [[WIDE_LOAD3]]
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP15]], <4 x float>* [[TMP20]], align 4
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 4
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP16]], <4 x float>* [[TMP22]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.cond.cleanup.loopexit:
-; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END5:%.*]] ]
-; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[INDVARS_IV]], 495616
-; CHECK-NEXT: br i1 [[CMP1]], label [[IF_END5]], label [[IF_ELSE:%.*]]
-; CHECK: if.else:
-; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 991232
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT: br label [[IF_END5]]
-; CHECK: if.end5:
-; CHECK-NEXT: [[X_0:%.*]] = phi float [ 4.200000e+01, [[IF_ELSE]] ], [ 2.300000e+01, [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = fmul float [[X_0]], [[TMP24]]
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[MUL]], float* [[ARRAYIDX7]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[TMP0]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-;
+; CHECK: vector.body:
+; CHECK: call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: scalar.ph:
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: if.else:
+; CHECK: call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: }
entry:
%cmp15 = icmp eq i32 %n, 0
!3 = distinct !{ !3, !2 }
!4 = !{ !3 }
+; CHECK: [[SCOPE0_LIST]] = !{[[SCOPE0:!.*]]}
+; CHECK: [[SCOPE0]] = distinct !{[[SCOPE0]], [[SCOPE0_DOM:!.*]]}
+; CHECK: [[SCOPE0_DOM]] = distinct !{[[SCOPE0_DOM]]}
+; CHECK: [[SCOPE4_LIST]] = !{[[SCOPE4:!.*]]}
+; CHECK: [[SCOPE4]] = distinct !{[[SCOPE4]], [[SCOPE0_DOM]]}
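+;
+; For reference, a minimal sketch of the metadata shape the captures above are
+; expected to match (the numeric ids below are illustrative, not taken from this
+; test's actual output):
+;   !7 = !{!8}                ; scope list referenced by the noalias.scope.decl call
+;   !8 = distinct !{!8, !9}   ; the scope itself, a member of domain !9
+;   !9 = distinct !{!9}       ; the self-referential scope domain
+; i.e. SCOPE0_LIST wraps SCOPE0, and SCOPE0/SCOPE4 share the single domain SCOPE0_DOM.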
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -passes="loop-vectorize,jump-threading" -debug-pass-manager < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; No vectorization happens, and the only change LV makes is LCSSA formation.
define i32 @novect(i32* %p) {
-; CHECK-LABEL: @novect(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[MIDDLE:%.*]]
-; CHECK: middle:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[MIDDLE]] ]
-; CHECK-NEXT: [[X:%.*]] = load volatile i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp slt i32 [[IV]], 1000
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[MIDDLE]]
-; CHECK: exit:
-; CHECK-NEXT: [[X_LCSSA:%.*]] = phi i32 [ [[X]], [[MIDDLE]] ]
-; CHECK-NEXT: ret i32 [[X_LCSSA]]
-;
+; CHECK: Running pass: LoopVectorizePass on novect
+; CHECK: Clearing all analysis results for: <possibly invalidated loop>
+; CHECK: Invalidating analysis: ScalarEvolutionAnalysis on novect
+; CHECK-NOT: Invalidating analysis: BranchProbabilityAnalysis on novect
+; CHECK-NOT: Invalidating analysis: BlockFrequencyAnalysis on novect
+; CHECK: Invalidating analysis: DemandedBitsAnalysis on novect
+; CHECK: Running pass: JumpThreadingPass on novect
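+;
+; Note on the pattern above: a CHECK-NOT directive only rejects its pattern when it
+; occurs between the adjacent CHECK matches, so the two CHECK-NOT lines assert that
+; BPI/BFI are not invalidated between the ScalarEvolutionAnalysis and
+; DemandedBitsAnalysis invalidation messages. A minimal sketch of the same idiom,
+; using a hypothetical check prefix PFX and placeholder analysis names:
+;   PFX: Invalidating analysis: AnalysisA on f
+;   PFX-NOT: Invalidating analysis: AnalysisB on f
+;   PFX: Invalidating analysis: AnalysisC on f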
+
+; CHECK: entry:
+; CHECK: br label %middle
+; CHECK: middle:
+; CHECK: %iv = phi i32 [ 0, %entry ], [ %iv.next, %middle ]
+; CHECK: %x = load volatile i32, i32* %p
+; CHECK: %iv.next = add i32 %iv, 1
+; CHECK: %cond = icmp slt i32 %iv, 1000
+; CHECK: br i1 %cond, label %exit, label %middle
+; CHECK: exit:
+; CHECK: %x.lcssa = phi i32 [ %x, %middle ]
+; CHECK: ret i32 %x.lcssa
entry:
br label %middle
; VF-TWO-CHECK-NEXT: [[TMP7:%.*]] = add nsw <2 x i32> [[WIDE_LOAD]], [[WIDE_LOAD2]]
; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; VF-TWO-CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; VF-TWO-CHECK: middle.block:
; VF-TWO-CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; VF-TWO-CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32> [[TMP7]], i32 1
; VF-TWO-CHECK-NEXT: [[N_VEC5:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF4]]
; VF-TWO-CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; VF-TWO-CHECK: vec.epilog.vector.body:
-; VF-TWO-CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 0
+; VF-TWO-CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; VF-TWO-CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX6]], 0
; VF-TWO-CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP10]]
; VF-TWO-CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
; VF-TWO-CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP13]], align 4
; VF-TWO-CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP10]]
; VF-TWO-CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP14]], i32 0
; VF-TWO-CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <2 x i32>*
-; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4
-; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = add nsw <2 x i32> [[WIDE_LOAD8]], [[WIDE_LOAD9]]
-; VF-TWO-CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[OFFSET_IDX]], 2
-; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC5]]
-; VF-TWO-CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <2 x i32>, <2 x i32>* [[TMP16]], align 4
+; VF-TWO-CHECK-NEXT: [[TMP17:%.*]] = add nsw <2 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD10]]
+; VF-TWO-CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX6]], 2
+; VF-TWO-CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT7]], [[N_VEC5]]
+; VF-TWO-CHECK-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], [[LOOP2:!llvm.loop !.*]]
; VF-TWO-CHECK: vec.epilog.middle.block:
-; VF-TWO-CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC5]]
+; VF-TWO-CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC5]]
; VF-TWO-CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP17]], i32 1
-; VF-TWO-CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; VF-TWO-CHECK-NEXT: br i1 [[CMP_N8]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; VF-TWO-CHECK: vec.epilog.scalar.ph:
; VF-TWO-CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC5]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
; VF-TWO-CHECK-NEXT: br label [[FOR_BODY:%.*]]
; VF-TWO-CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP20]], [[TMP21]]
; VF-TWO-CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; VF-TWO-CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
-; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
+; VF-TWO-CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT_LOOPEXIT]], [[LOOP4:!llvm.loop !.*]]
; VF-TWO-CHECK: for.end.loopexit.loopexit:
; VF-TWO-CHECK-NEXT: [[ADD_LCSSA3:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP19]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
; VF-TWO-CHECK-NEXT: br label [[FOR_END_LOOPEXIT]]
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt < %s -passes='loop-vectorize' -force-target-supports-scalable-vectors=true -enable-epilogue-vectorization -epilogue-vectorization-force-VF=2 --debug-only=loop-vectorize -S -scalable-vectorization=on 2>&1 | FileCheck %s
; CHECK: Epilogue Loop VF:2, Epilogue Loop UF:1
define void @f1(i8* %A) {
-; CHECK-LABEL: @f1(
-; CHECK-NEXT: iter.check:
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <vscale x 4 x i8>*
-; CHECK-NEXT: store <vscale x 4 x i8> shufflevector (<vscale x 4 x i8> insertelement (<vscale x 4 x i8> poison, i8 1, i32 0), <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i8>* [[TMP7]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 1024, [[N_VEC]]
-; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 2
-; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <2 x i8>*
-; CHECK-NEXT: store <2 x i8> <i8 1, i8 1>, <2 x i8>* [[TMP14]], align 1
-; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[OFFSET_IDX]], 2
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024
-; CHECK-NEXT: br i1 [[TMP15]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 1024, 1024
-; CHECK-NEXT: br i1 [[CMP_N1]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IV]]
-; CHECK-NEXT: store i8 1, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: exit.loopexit:
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body
; CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF3]]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX5]], 0
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[BB]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>* [[TMP15]], align 4
+; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP15]], align 4
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[CC]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[TMP16]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = bitcast float* [[TMP17]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, <4 x float>* [[TMP18]], align 4
-; CHECK-NEXT: [[TMP19:%.*]] = fadd fast <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD8]]
+; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, <4 x float>* [[TMP18]], align 4
+; CHECK-NEXT: [[TMP19:%.*]] = fadd fast <4 x float> [[WIDE_LOAD8]], [[WIDE_LOAD9]]
; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[AA]], i64 [[TMP12]]
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
; CHECK-NEXT: [[TMP22:%.*]] = bitcast float* [[TMP21]] to <4 x float>*
; CHECK-NEXT: store <4 x float> [[TMP19]], <4 x float>* [[TMP22]], align 4
-; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]]
+; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], 4
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC4]]
; CHECK-NEXT: br i1 [[TMP23]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[CMP_N5:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC4]]
-; CHECK-NEXT: br i1 [[CMP_N5]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC4]]
+; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC4]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP2]])
; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP3]], [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = or i1 [[TMP4]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
-; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK-NEXT: [[TMP4:%.*]] = sub i32 [[TMP0]], [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP4]], [[TMP0]]
+; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[TMP1]], 4294967295
+; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP8]]
+; CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
; CHECK: vector.main.loop.iter.check:
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = xor i32 [[TMP8]], -1
-; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[N]]
-; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, float* [[TMP14]], i32 -3
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast float* [[TMP15]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP16]], align 4
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP17:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP18]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = xor i32 [[TMP12]], -1
+; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], [[N]]
+; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP18]], i32 -3
; CHECK-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP17]], <4 x float>* [[TMP20]], align 4
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP20]], align 4
+; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP21:%.*]] = fadd fast <4 x float> [[REVERSE]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP13]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[TMP22]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast float* [[TMP23]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP21]], <4 x float>* [[TMP24]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: [[IND_END5:%.*]] = trunc i64 [[N_VEC]] to i32
+; CHECK-NEXT: [[IND_END7:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], 4
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
; CHECK-NEXT: [[IND_END:%.*]] = trunc i64 [[N_VEC3]] to i32
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX8:%.*]] = trunc i64 [[OFFSET_IDX9]] to i32
-; CHECK-NEXT: [[TMP22:%.*]] = add i32 [[OFFSET_IDX8]], 0
-; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[OFFSET_IDX9]], 0
-; CHECK-NEXT: [[TMP24:%.*]] = xor i32 [[TMP22]], -1
-; CHECK-NEXT: [[TMP25:%.*]] = add i32 [[TMP24]], [[N]]
-; CHECK-NEXT: [[TMP26:%.*]] = sext i32 [[TMP25]] to i64
-; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP26]]
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP27]], i32 0
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[TMP28]], i32 -3
-; CHECK-NEXT: [[TMP30:%.*]] = bitcast float* [[TMP29]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP30]], align 4
-; CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[TMP31:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP23]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP32]], i32 0
+; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX9:%.*]] = trunc i64 [[INDEX4]] to i32
+; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[OFFSET_IDX9]], 0
+; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[INDEX4]], 0
+; CHECK-NEXT: [[TMP28:%.*]] = xor i32 [[TMP26]], -1
+; CHECK-NEXT: [[TMP29:%.*]] = add i32 [[TMP28]], [[N]]
+; CHECK-NEXT: [[TMP30:%.*]] = sext i32 [[TMP29]] to i64
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP30]]
+; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[TMP31]], i32 0
+; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[TMP32]], i32 -3
; CHECK-NEXT: [[TMP34:%.*]] = bitcast float* [[TMP33]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP31]], <4 x float>* [[TMP34]], align 4
-; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[OFFSET_IDX9]], 4
-; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT12]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[TMP35]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, <4 x float>* [[TMP34]], align 4
+; CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[TMP35:%.*]] = fadd fast <4 x float> [[REVERSE11]], <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP27]]
+; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, float* [[TMP36]], i32 0
+; CHECK-NEXT: [[TMP38:%.*]] = bitcast float* [[TMP37]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP35]], <4 x float>* [[TMP38]], align 4
+; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX4]], 4
+; CHECK-NEXT: [[TMP39:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[TMP39]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N8:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[CMP_N8]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL4:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END5]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END7]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[ITER_CHECK]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL4]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP36:%.*]] = xor i32 [[I_014]], -1
-; CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP36]], [[N]]
+; CHECK-NEXT: [[I_014:%.*]] = phi i32 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP40:%.*]] = xor i32 [[I_014]], -1
+; CHECK-NEXT: [[SUB2:%.*]] = add i32 [[TMP40]], [[N]]
; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[SUB2]] to i64
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[IDXPROM]]
-; CHECK-NEXT: [[TMP37:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP37]], 1.000000e+00
+; CHECK-NEXT: [[TMP41:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[CONV3:%.*]] = fadd fast float [[TMP41]], 1.000000e+00
; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
; CHECK-NEXT: store float [[CONV3]], float* [[ARRAYIDX5]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX4]], 0
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
; CHECK-NEXT: store <4 x i8> <i8 1, i8 1, i8 1, i8 1>, <4 x i8>* [[TMP8]], align 1
-; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
+; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX4]], 4
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC3]]
; CHECK-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
-; CHECK-NEXT: br i1 [[CMP_N4]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
+; CHECK-NEXT: br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK: vec.epilog.scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.vector.body:
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP5:%.*]] = add i64 [[INDEX4]], 0
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[TMP5]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i32 0
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <2 x i8>*
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> <i8 1, i8 1>, <2 x i8>* [[TMP8]], align 1
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[OFFSET_IDX]], 2
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT5]] = add nuw i64 [[INDEX4]], 2
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT5]], [[N_VEC3]]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP9]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.middle.block:
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CMP_N4:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
-; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N4]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[CMP_N6:%.*]] = icmp eq i64 [[N]], [[N_VEC3]]
+; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[CMP_N6]], label [[FOR_END_LOOPEXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.scalar.ph:
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[FOR_BODY:%.*]]
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: loop.1.latch:
; CHECK-NEXT: [[IV_1_NEXT]] = add nuw nsw i64 [[IV_1]], 1
; CHECK-NEXT: [[EC_1:%.*]] = icmp eq i64 [[IV_1_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EC_1]], label [[EXIT]], label [[LOOP_1_HEADER]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC_1]], label [[EXIT]], label [[LOOP_1_HEADER]], [[LOOP2:!llvm.loop !.*]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8*, i8** [[START_1]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP1]], i64 1
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to <4 x i8*>*
-; CHECK-NEXT: store <4 x i8*> [[TMP2]], <4 x i8*>* [[TMP4]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i8*> [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[TMP8]], <4 x i8>* [[TMP9]], align 1
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <4 x i64> <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8** [[TMP4]] to <4 x i8*>*
+; CHECK-NEXT: store <4 x i8*> [[TMP3]], <4 x i8*>* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i8*> [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, i8* [[TMP6]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP8]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
+; CHECK-NEXT: store <4 x i8> [[TMP9]], <4 x i8>* [[TMP10]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 4
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[TMP4]], i8 [[TMP5]])
; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = sub i8 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i8 [[TMP6]], 0
-; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP3]], i1 [[TMP8]], i1 [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub i8 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = icmp sgt i8 [[TMP7]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP3]], i1 [[TMP9]], i1 [[TMP8]]
+; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP10]], [[MUL_OVERFLOW]]
; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], 255
; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP1]], 0
; CHECK-NEXT: [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP10]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = sext i8 [[TMP1]] to i32
-; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP14]], [[IDENT_CHECK]]
-; CHECK-NEXT: br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP14]], [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = sext i8 [[TMP1]] to i32
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP15]], [[IDENT_CHECK]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[CAST_CRD]], [[STEP]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP17:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP17]]
-; CHECK-NEXT: [[TMP18:%.*]] = mul i32 [[STEP]], 4
-; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP18]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[STEP]], 4
+; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0
; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP22]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP22]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP24]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[TMP4]], i8 [[TMP5]])
; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = sub i8 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i8 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP3]], i1 [[TMP7]], i1 false
-; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP0]], 255
-; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[TMP1]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
-; CHECK-NEXT: [[TMP14:%.*]] = sext i8 [[TMP1]] to i32
-; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP13]], [[IDENT_CHECK]]
-; CHECK-NEXT: br i1 [[TMP15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub i8 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt i8 [[TMP7]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP3]], i1 [[TMP9]], i1 false
+; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP10]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP0]], 255
+; CHECK-NEXT: [[TMP12:%.*]] = icmp ne i8 [[TMP1]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = and i1 [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP14]], [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = sext i8 [[TMP1]] to i32
+; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[STEP]], [[TMP17]]
+; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP15]], [[IDENT_CHECK]]
+; CHECK-NEXT: br i1 [[TMP18]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[CAST_CRD]], [[STEP]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[STEP]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP16]]
-; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[STEP]], 4
-; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP17]], i32 0
+; CHECK-NEXT: [[TMP19:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP19]]
+; CHECK-NEXT: [[TMP20:%.*]] = mul i32 [[STEP]], 4
+; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP20]], i32 0
; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP21]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP21]]
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[TMP22]], i32 0
+; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP24]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[MUL:%.*]] = call { i8, i1 } @llvm.umul.with.overflow.i8(i8 [[TMP3]], i8 [[TMP4]])
; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i8, i1 } [[MUL]], 0
; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i8, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = sub i8 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
-; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt i8 [[TMP5]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = select i1 [[TMP2]], i1 [[TMP7]], i1 [[TMP6]]
-; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP8]], [[MUL_OVERFLOW]]
+; CHECK-NEXT: [[TMP6:%.*]] = sub i8 0, [[MUL_RESULT]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp slt i8 [[MUL_RESULT]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt i8 [[TMP6]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP2]], i1 [[TMP8]], i1 [[TMP7]]
+; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP0]], 255
; CHECK-NEXT: [[TMP11:%.*]] = icmp ne i8 [[CSTEP]], 0
; CHECK-NEXT: [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[TMP9]], [[TMP12]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP13]], [[TMP12]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[CAST_CRD]], [[CONV]]
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[CONV]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP14:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[CONV]], 4
-; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP15]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = mul <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[DOTSPLAT]]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> zeroinitializer, [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[CONV]], 4
+; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP17]], i32 0
; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT2]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP19]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [250 x i32], [250 x i32]* @a, i64 0, i64 [[TMP18]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP21]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], [[DOTSPLAT3]]
-; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -indvars < %s | FileCheck %s
; Produced from the test-case:
@theSize = external local_unnamed_addr global i32, align 4
define void @foo(i8* %buf, i32 %denominator, i32* %flag) local_unnamed_addr {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @theSize, align 4
-; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[TMP0]], [[DENOMINATOR:%.*]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[FLAG:%.*]], align 4
-; CHECK-NEXT: [[TOBOOL5:%.*]] = icmp eq i32 [[TMP1]], 0
-; CHECK-NEXT: br i1 [[TOBOOL5]], label [[WHILE_END:%.*]], label [[WHILE_BODY_LR_PH:%.*]]
-; CHECK: while.body.lr.ph:
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_LR_PH]] ]
-; CHECK-NEXT: [[BUF_ADDR_07:%.*]] = phi i8* [ [[BUF:%.*]], [[WHILE_BODY_LR_PH]] ], [ [[CALL:%.*]], [[WHILE_BODY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[DIV]] to i64
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* @theSize, align 4
-; CHECK-NEXT: store i32 [[TMP3]], i32* [[I]], align 4
-; CHECK-NEXT: call void @bar(i32* nonnull [[I]], i64 [[INDVARS_IV_NEXT]])
-; CHECK-NEXT: [[CALL]] = call i8* @processBuf(i8* [[BUF_ADDR_07]])
-; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[FLAG]], align 4
-; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[TMP4]], 0
-; CHECK-NEXT: br i1 [[TOBOOL]], label [[WHILE_END_LOOPEXIT:%.*]], label [[WHILE_BODY]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[I]] to i8*
-; CHECK-NEXT: br label [[WHILE_END]]
-; CHECK: while.end:
-; CHECK-NEXT: ret void
-;
entry:
%i = alloca i32, align 4
%0 = load i32, i32* @theSize, align 4
while.body: ; preds = %while.body.lr.ph, %while.body
; Check that there are two PHIs followed by a 'sext' in the same block, and that
; the test does not crash.
+; CHECK: phi
+; CHECK-NEXT: phi
+; CHECK-NEXT: sext
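+; (Here indvars widens the i32 counter %inx.06 to an i64 induction variable, so
+; the loop-invariant step %div is sign-extended inside the loop, right after
+; the two PHIs.)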
%buf.addr.07 = phi i8* [ %buf, %while.body.lr.ph ], [ %call, %while.body ]
%inx.06 = phi i32 [ 0, %while.body.lr.ph ], [ %add, %while.body ]
%add = add nsw i32 %inx.06, %div
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -S < %s 2>&1 | FileCheck %s
; Produced from test-case:
target triple = "x86_64-unknown-linux-gnu"
define void @testGuardedInnerLoop(i32* %ptr, i32 %denom, i32 %numer, i32 %outer_lim) {
-; CHECK-LABEL: @testGuardedInnerLoop(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[OUTER_LIM:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP1]], label [[EXIT:%.*]], label [[LOOP1_PREHEADER:%.*]]
-; CHECK: loop1.preheader:
-; CHECK-NEXT: br label [[LOOP1:%.*]]
-; CHECK: loop1:
-; CHECK-NEXT: [[OUTER_I:%.*]] = phi i32 [ [[INC1:%.*]], [[LOOP2_EXIT:%.*]] ], [ 0, [[LOOP1_PREHEADER]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[DENOM:%.*]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], [[NUMER:%.*]]
-; CHECK-NEXT: br i1 [[TMP1]], label [[LOOP2_PREHEADER:%.*]], label [[LOOP2_EXIT]]
-; CHECK: loop2.preheader:
-; CHECK-NEXT: [[LIM:%.*]] = udiv i32 [[NUMER]], [[DENOM]]
-; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[LIM]] to i64
-; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 1)
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 8
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], 8
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 4
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32>* [[TMP10]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP2_EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP2_PREHEADER]] ]
-; CHECK-NEXT: br label [[LOOP2:%.*]]
-; CHECK: loop2:
-; CHECK-NEXT: [[INDVAR_LOOP2:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVAR_LOOP2_NEXT:%.*]], [[LOOP2]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[INDVAR_LOOP2]]
-; CHECK-NEXT: store i32 1, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVAR_LOOP2_NEXT]] = add nuw nsw i64 [[INDVAR_LOOP2]], 1
-; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[INDVAR_LOOP2_NEXT]], [[TMP2]]
-; CHECK-NEXT: br i1 [[CMP2]], label [[LOOP2]], label [[LOOP2_EXIT_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: loop2.exit.loopexit:
-; CHECK-NEXT: br label [[LOOP2_EXIT]]
-; CHECK: loop2.exit:
-; CHECK-NEXT: [[INC1]] = add nuw i32 [[OUTER_I]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC1]], [[OUTER_LIM]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP1]]
-; CHECK: exit.loopexit:
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
entry:
%cmp1 = icmp eq i32 %outer_lim, 0
br i1 %cmp1, label %exit, label %loop1.preheader
; Verify that a 'udiv' does not appear between the 'loop1.preheader' label and
; whatever label comes next.
loop1.preheader:
+; CHECK-LABEL: loop1.preheader:
+; CHECK-NOT: udiv
+; CHECK-LABEL: :
br label %loop1
loop1:
; Verify that a 'udiv' does appear between the 'loop2.preheader' label and
; whatever label comes next.
loop2.preheader:
+; CHECK-LABEL: loop2.preheader:
+; CHECK: udiv
+; CHECK-LABEL: :
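+; The guard in loop1 ('icmp ult (%denom - 1), %numer') can only be true when
+; %denom is nonzero, so the division below is safe here, but it would not be
+; safe if hoisted above the guard into loop1.preheader.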
%lim = udiv i32 %numer, %denom
%2 = zext i32 %lim to i64
br label %loop2
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
; This cannot be correctly vectorized with type i1.
define i8 @test_01(i8 %c) #0 {
+
; CHECK-LABEL: @test_01(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: exit:
-; CHECK-NEXT: [[ACCUM_PLUS_LCSSA:%.*]] = phi i8 [ [[ACCUM_PLUS:%.*]], [[LOOP]] ]
-; CHECK-NEXT: ret i8 [[ACCUM_PLUS_LCSSA]]
-; CHECK: loop:
-; CHECK-NEXT: [[ACCUM_PHI:%.*]] = phi i8 [ [[C:%.*]], [[ENTRY:%.*]] ], [ [[ACCUM_PLUS]], [[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[ACCUM_AND:%.*]] = and i8 [[ACCUM_PHI]], 1
-; CHECK-NEXT: [[ACCUM_PLUS]] = add nuw nsw i8 [[ACCUM_AND]], 3
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp ugt i32 [[IV]], 191
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]]
-;
+; CHECK-NOT: vector.body:
+; CHECK-NOT: zext i1 {{.*}} to i8
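+; A rough illustration of why narrowing to i1 would be unsound here: the
+; reduction computes accum = (accum & 1) + 3, so accum cycles through the i8
+; values 3 and 4 and the full i8 value is returned; keeping only the low bit
+; (e.g. zext of an i1) would turn 4 into 0 and change the result.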
entry:
br label %loop
; TODO: This can be vectorized with type i1 because the result is not used.
define void @test_02(i8 %c) #0 {
+
; CHECK-LABEL: @test_02(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: exit:
-; CHECK-NEXT: [[LCSSA:%.*]] = phi i8 [ [[ACCUM_PLUS:%.*]], [[LOOP]] ]
-; CHECK-NEXT: ret void
-; CHECK: loop:
-; CHECK-NEXT: [[ACCUM_PHI:%.*]] = phi i8 [ [[C:%.*]], [[ENTRY:%.*]] ], [ [[ACCUM_PLUS]], [[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[ACCUM_AND:%.*]] = and i8 [[ACCUM_PHI]], 1
-; CHECK-NEXT: [[ACCUM_PLUS]] = add nuw nsw i8 [[ACCUM_AND]], 3
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp ugt i32 [[IV]], 191
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]]
-;
+; CHECK-NOT: vector.body:
entry:
br label %loop
; This can be vectorized with type i1 because the result is truncated properly.
define i1 @test_03(i8 %c) #0 {
+
; CHECK-LABEL: @test_03(
-; CHECK-NEXT: iter.check:
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
-; CHECK: vector.main.loop.iter.check:
-; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i8> zeroinitializer, i8 [[C:%.*]], i32 0
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i8> [ [[TMP0]], [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = and <16 x i8> [[VEC_PHI]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: [[TMP2:%.*]] = and <16 x i8> [[VEC_PHI1]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i8> [[TMP1]], <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i8> [[TMP2]], <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 192
-; CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i8> [[TMP3]] to <16 x i1>
-; CHECK-NEXT: [[TMP7]] = zext <16 x i1> [[TMP6]] to <16 x i8>
-; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i8> [[TMP4]] to <16 x i1>
-; CHECK-NEXT: [[TMP9]] = zext <16 x i1> [[TMP8]] to <16 x i8>
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP10:%.*]] = trunc <16 x i8> [[TMP7]] to <16 x i1>
-; CHECK-NEXT: [[TMP11:%.*]] = trunc <16 x i8> [[TMP9]] to <16 x i1>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <16 x i1> [[TMP11]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> [[BIN_RDX]])
-; CHECK-NEXT: [[TMP13:%.*]] = zext i1 [[TMP12]] to i8
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 192, 192
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
-; CHECK: vec.epilog.iter.check:
-; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
-; CHECK: vec.epilog.ph:
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[C]], [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ]
-; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ 192, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i8> zeroinitializer, i8 [[BC_MERGE_RDX]], i32 0
-; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
-; CHECK: vec.epilog.vector.body:
-; CHECK-NEXT: [[INDEX3:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI4:%.*]] = phi <8 x i8> [ [[TMP14]], [[VEC_EPILOG_PH]] ], [ [[TMP19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = and <8 x i8> [[VEC_PHI4]], <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: [[TMP16:%.*]] = add <8 x i8> [[TMP15]], <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
-; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i32 [[INDEX3]], 8
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 192
-; CHECK-NEXT: [[TMP18:%.*]] = trunc <8 x i8> [[TMP16]] to <8 x i1>
-; CHECK-NEXT: [[TMP19]] = zext <8 x i1> [[TMP18]] to <8 x i8>
-; CHECK-NEXT: br i1 [[TMP17]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: vec.epilog.middle.block:
-; CHECK-NEXT: [[TMP20:%.*]] = trunc <8 x i8> [[TMP19]] to <8 x i1>
-; CHECK-NEXT: [[TMP21:%.*]] = call i1 @llvm.vector.reduce.add.v8i1(<8 x i1> [[TMP20]])
-; CHECK-NEXT: [[TMP22:%.*]] = zext i1 [[TMP21]] to i8
-; CHECK-NEXT: [[CMP_N2:%.*]] = icmp eq i32 192, 192
-; CHECK-NEXT: br i1 [[CMP_N2]], label [[EXIT_LOOPEXIT:%.*]], label [[VEC_EPILOG_SCALAR_PH]]
-; CHECK: vec.epilog.scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 193, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 193, [[VEC_EPILOG_ITER_CHECK]] ], [ 1, [[ITER_CHECK:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX6:%.*]] = phi i8 [ [[C]], [[ITER_CHECK]] ], [ [[TMP13]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: exit.loopexit:
-; CHECK-NEXT: [[ACCUM_PLUS_LCSSA:%.*]] = phi i8 [ [[ACCUM_PLUS:%.*]], [[LOOP]] ], [ [[TMP22]], [[VEC_EPILOG_MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: [[LCSSA:%.*]] = phi i8 [ [[TMP13]], [[MIDDLE_BLOCK]] ], [ [[ACCUM_PLUS_LCSSA]], [[EXIT_LOOPEXIT]] ]
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[LCSSA]] to i1
-; CHECK-NEXT: ret i1 [[TRUNC]]
-; CHECK: loop:
-; CHECK-NEXT: [[ACCUM_PHI:%.*]] = phi i8 [ [[BC_MERGE_RDX6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ACCUM_PLUS]], [[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[ACCUM_AND:%.*]] = and i8 [[ACCUM_PHI]], 1
-; CHECK-NEXT: [[ACCUM_PLUS]] = add nuw nsw i8 [[ACCUM_AND]], 3
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp ugt i32 [[IV]], 191
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
-;
+; CHECK: vector.body:
+; CHECK: zext i1 {{.*}} to i8
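+; Only the low bit of the reduction is observable here (the scalar result is
+; truncated to i1 on return), so carrying the reduction as i1, with a zext back
+; to i8 between iterations, is sound.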
entry:
br label %loop
; This cannot be vectorized with type i1 because the result is truncated to the
; wrong type.
; TODO: It can also be vectorized with type i32 (or maybe i4?)
define i4 @test_04(i8 %c) #0 {
+
; CHECK-LABEL: @test_04(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: exit:
-; CHECK-NEXT: [[LCSSA:%.*]] = phi i8 [ [[ACCUM_PLUS:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[LCSSA]] to i4
-; CHECK-NEXT: ret i4 [[TRUNC]]
-; CHECK: loop:
-; CHECK-NEXT: [[ACCUM_PHI:%.*]] = phi i8 [ [[C:%.*]], [[ENTRY:%.*]] ], [ [[ACCUM_PLUS]], [[LOOP]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[ACCUM_AND:%.*]] = and i8 [[ACCUM_PHI]], 1
-; CHECK-NEXT: [[ACCUM_PLUS]] = add nuw nsw i8 [[ACCUM_AND]], 3
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
-; CHECK-NEXT: [[COND:%.*]] = icmp ugt i32 [[IV]], 191
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT:%.*]], label [[LOOP]]
-;
+; CHECK-NOT: vector.body:
+; CHECK-NOT: zext i1 {{.*}} to i8
entry:
br label %loop
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
define void @doit1(i32* %ptr) {
; CHECK-LABEL: @doit1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 9, i32 18, i32 27>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND2:%.*]] = phi <4 x i8> [ <i8 0, i8 9, i8 18, i8 27>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT3:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1]] = add <4 x i8> [[VEC_PHI]], [[VEC_IND2]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[PTR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[VEC_IND]], <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 36, i32 36, i32 36, i32 36>
-; CHECK-NEXT: [[VEC_IND_NEXT3]] = add <4 x i8> [[VEC_IND2]], <i8 36, i8 36, i8 36, i8 36>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP1]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 16, 16
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 144, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ 0, [[ENTRY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[MAIN_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[I8_IV:%.*]] = phi i8 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[I8_ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[I32_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[I32_ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TRUNC_TO_BE_CONVERTED_TO_NEW_IV:%.*]] = trunc i32 [[I32_IV]] to i8
-; CHECK-NEXT: [[I8_ADD]] = add i8 [[I8_IV]], [[TRUNC_TO_BE_CONVERTED_TO_NEW_IV]]
-; CHECK-NEXT: [[PTR_GEP:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i32 [[MAIN_IV]]
-; CHECK-NEXT: store i32 [[I32_IV]], i32* [[PTR_GEP]], align 4
-; CHECK-NEXT: [[NOOP_CONV_UNDER_PSE:%.*]] = and i32 [[I32_IV]], 255
-; CHECK-NEXT: [[I32_ADD]] = add nuw nsw i32 [[NOOP_CONV_UNDER_PSE]], 9
-; CHECK-NEXT: [[INC]] = add i32 [[MAIN_IV]], 1
-; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[INC]], 16
-; CHECK-NEXT: br i1 [[TOBOOL]], label [[FOR_COND_FOR_END_CRIT_EDGE]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.cond.for.end_crit_edge:
-; CHECK-NEXT: [[I8_ADD_LCSSA:%.*]] = phi i8 [ [[I8_ADD]], [[FOR_BODY]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: store i8 [[I8_ADD_LCSSA]], i8* @b, align 1
-; CHECK-NEXT: br label [[FOR_END:%.*]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-NEXT: [[MAIN_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[MAIN_IV_NEXT:%.*]], [[VECTOR_BODY:%.*]] ]
+; CHECK-NEXT: [[I8_IV:%.*]] = phi <4 x i8> [ zeroinitializer, [[VECTOR_PH]] ], [ [[I8_IV_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[I32_IV:%.*]] = phi <4 x i32> [ <i32 0, i32 9, i32 18, i32 27>, [[VECTOR_PH]] ], [ [[I32_IV_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[IV_FROM_TRUNC:%.*]] = phi <4 x i8> [ <i8 0, i8 9, i8 18, i8 27>, [[VECTOR_PH]] ], [ [[IV_FROM_TRUNC_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[MAIN_IV]], 0
+; CHECK-NEXT: [[I8_IV_NEXT]] = add <4 x i8> [[I8_IV]], [[IV_FROM_TRUNC]]
+; CHECK-NEXT: [[GEP1:%.+]] = getelementptr inbounds i32, i32* %ptr, i32 [[TMP7]]
+; CHECK-NEXT: [[GEP2:%.+]] = getelementptr inbounds i32, i32* [[GEP1]], i32 0
+; CHECK-NEXT: [[GEP_BC:%.+]] = bitcast i32* [[GEP2]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[I32_IV]], <4 x i32>* [[GEP_BC]], align 4
+; CHECK-NEXT: [[MAIN_IV_NEXT]] = add nuw i32 [[MAIN_IV]], 4
+; CHECK-NEXT: [[I32_IV_NEXT]] = add <4 x i32> [[I32_IV]], <i32 36, i32 36, i32 36, i32 36>
+; CHECK-NEXT: [[IV_FROM_TRUNC_NEXT]] = add <4 x i8> [[IV_FROM_TRUNC]], <i8 36, i8 36, i8 36, i8 36>
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[MAIN_IV_NEXT]], 16
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+;
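+; The scalar loop truncates its i32 induction variable to i8 before
+; accumulating, so the vectorizer introduces a separate <4 x i8> induction
+; ([[IV_FROM_TRUNC]] above, starting at <0, 9, 18, 27> and stepping by 36,
+; i.e. 4 * 9) rather than truncating inside the vector body.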
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes='loop-vectorize' -S -pass-remarks-missed=loop-vectorize < %s 2>&1 | FileCheck %s
;
; FP primary induction is not supported in LV. Make sure Legal bails out.
; CHECK: loop not vectorized
define void @PR37515() {
-; CHECK-LABEL: @PR37515(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[P:%.*]] = phi float [ 1.900000e+01, [[ENTRY:%.*]] ], [ [[A:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[A]] = fadd fast float [[P]], -1.000000e+00
-; CHECK-NEXT: [[M:%.*]] = fmul fast float [[A]], [[A]]
-; CHECK-NEXT: [[C:%.*]] = fcmp fast ugt float [[A]], 2.000000e+00
-; CHECK-NEXT: br i1 [[C]], label [[LOOP]], label [[EXIT:%.*]]
-; CHECK: exit:
-; CHECK-NEXT: unreachable
-;
entry:
br label %loop
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=2 -S < %s 2>&1 | FileCheck %s
; RUN: opt -indvars -S < %s 2>&1 | FileCheck %s -check-prefix=INDVARCHECK
define void @testCountIncrLoop(i8* %ptr, i32 %lim, i32 %count, i32 %val) mustprogress {
; CHECK-LABEL: @testCountIncrLoop(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[LIM:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP1]], label [[LOOP1_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK: loop1.preheader:
-; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
-; CHECK-NEXT: [[CMP4:%.*]] = icmp slt i32 [[COUNT]], 8
-; CHECK-NEXT: [[TMP0:%.*]] = shl i32 [[COUNT]], 1
-; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP0]], i32 8)
-; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[SMAX]], [[TMP0]]
-; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP1]], i32 1)
-; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[SMAX]], [[UMIN]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i32 [[TMP2]], [[TMP0]]
-; CHECK-NEXT: br label [[LOOP1_BODY:%.*]]
+; CHECK-NOT: udiv
; CHECK: loop1.body:
-; CHECK-NEXT: [[OUTER_I:%.*]] = phi i32 [ 0, [[LOOP1_PREHEADER]] ], [ [[OUTER_I_1:%.*]], [[LOOP1_INC:%.*]] ]
-; CHECK-NEXT: [[INX_1:%.*]] = phi i32 [ 0, [[LOOP1_PREHEADER]] ], [ [[INX_2:%.*]], [[LOOP1_INC]] ]
-; CHECK-NEXT: br i1 [[CMP2]], label [[WHILE_COND_PREHEADER:%.*]], label [[LOOP1_INC]]
; CHECK: while.cond.preheader:
-; CHECK-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
; CHECK: while.body.preheader:
-; CHECK-NEXT: [[TMP4:%.*]] = udiv i32 [[TMP3]], [[COUNT]]
-; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[UMIN]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP6]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = udiv i32 [[TMP0:%.*]], [[COUNT:%.*]]
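+; The inner while-loop's trip count divides by %count, and %count is only known
+; to be positive once the 'icmp sgt %count, 0' guard has been taken, so the
+; udiv cannot safely be emitted in loop1.preheader and is expected in
+; while.body.preheader instead.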
; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP6]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP6]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[N_VEC]], [[COUNT]]
-; CHECK-NEXT: [[IND_END:%.*]] = add i32 [[COUNT]], [[TMP7]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[COUNT]], i32 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[COUNT]], i32 0
-; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = mul <2 x i32> <i32 0, i32 1>, [[DOTSPLAT2]]
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i32> [[DOTSPLAT]], [[TMP8]]
-; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[COUNT]], 2
-; CHECK-NEXT: [[DOTSPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[TMP9]], i32 0
-; CHECK-NEXT: [[DOTSPLAT4:%.*]] = shufflevector <2 x i32> [[DOTSPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[VAL:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[VAL]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <2 x i32> poison, i32 [[VAL]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT7]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], [[DOTSPLAT4]]
-; CHECK-NEXT: [[TMP11:%.*]] = ashr <2 x i32> [[BROADCAST_SPLAT]], [[VEC_IND]]
-; CHECK-NEXT: [[TMP12:%.*]] = ashr <2 x i32> [[BROADCAST_SPLAT8]], [[STEP_ADD]]
-; CHECK-NEXT: [[TMP13]] = add <2 x i32> [[TMP11]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP14]] = add <2 x i32> [[TMP12]], [[VEC_PHI6]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], [[DOTSPLAT4]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[TMP14]], [[TMP13]]
-; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP6]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[COUNT]], [[WHILE_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[VAL]], [[WHILE_BODY_PREHEADER]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; CHECK: while.body:
-; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ [[ADD3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[RESULT_1:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[VAL]], [[TMP]]
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[SHR]], [[RESULT_1]]
-; CHECK-NEXT: [[ADD3]] = add nsw i32 [[TMP]], [[COUNT]]
-; CHECK-NEXT: [[CMP3:%.*]] = icmp slt i32 [[ADD3]], 8
-; CHECK-NEXT: br i1 [[CMP3]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: while.end.loopexit:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ], [ [[TMP16]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[WHILE_END]]
-; CHECK: while.end:
-; CHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[VAL]], [[WHILE_COND_PREHEADER]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[RESULT_0_LCSSA]] to i8
-; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[INX_1]], 1
-; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[INX_1]] to i64
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[IDXPROM]]
-; CHECK-NEXT: store i8 [[CONV]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: br label [[LOOP1_INC]]
-; CHECK: loop1.inc:
-; CHECK-NEXT: [[INX_2]] = phi i32 [ [[INC]], [[WHILE_END]] ], [ [[INX_1]], [[LOOP1_BODY]] ]
-; CHECK-NEXT: [[OUTER_I_1]] = add nuw nsw i32 [[OUTER_I]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[OUTER_I_1]], [[LIM]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP1_BODY]]
-; CHECK: exit.loopexit:
-; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
-; CHECK-NEXT: ret void
-;
-; INDVARCHECK-LABEL: @testCountIncrLoop(
-; INDVARCHECK-NEXT: entry:
-; INDVARCHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[LIM:%.*]], 0
-; INDVARCHECK-NEXT: br i1 [[CMP1]], label [[LOOP1_PREHEADER:%.*]], label [[EXIT:%.*]]
-; INDVARCHECK: loop1.preheader:
-; INDVARCHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[COUNT:%.*]], 0
-; INDVARCHECK-NEXT: [[CMP4:%.*]] = icmp slt i32 [[COUNT]], 8
-; INDVARCHECK-NEXT: br label [[LOOP1_BODY:%.*]]
-; INDVARCHECK: loop1.body:
-; INDVARCHECK-NEXT: [[OUTER_I:%.*]] = phi i32 [ 0, [[LOOP1_PREHEADER]] ], [ [[OUTER_I_1:%.*]], [[LOOP1_INC:%.*]] ]
-; INDVARCHECK-NEXT: [[INX_1:%.*]] = phi i32 [ 0, [[LOOP1_PREHEADER]] ], [ [[INX_2:%.*]], [[LOOP1_INC]] ]
-; INDVARCHECK-NEXT: br i1 [[CMP2]], label [[WHILE_COND_PREHEADER:%.*]], label [[LOOP1_INC]]
-; INDVARCHECK: while.cond.preheader:
-; INDVARCHECK-NEXT: br i1 [[CMP4]], label [[WHILE_BODY_PREHEADER:%.*]], label [[WHILE_END:%.*]]
-; INDVARCHECK: while.body.preheader:
-; INDVARCHECK-NEXT: br label [[WHILE_BODY:%.*]]
-; INDVARCHECK: while.body:
-; INDVARCHECK-NEXT: [[TMP:%.*]] = phi i32 [ [[ADD3:%.*]], [[WHILE_BODY]] ], [ [[COUNT]], [[WHILE_BODY_PREHEADER]] ]
-; INDVARCHECK-NEXT: [[RESULT_1:%.*]] = phi i32 [ [[ADD:%.*]], [[WHILE_BODY]] ], [ [[VAL:%.*]], [[WHILE_BODY_PREHEADER]] ]
-; INDVARCHECK-NEXT: [[SHR:%.*]] = ashr i32 [[VAL]], [[TMP]]
-; INDVARCHECK-NEXT: [[ADD]] = add nsw i32 [[SHR]], [[RESULT_1]]
-; INDVARCHECK-NEXT: [[ADD3]] = add nsw i32 [[TMP]], [[COUNT]]
-; INDVARCHECK-NEXT: [[CMP3:%.*]] = icmp slt i32 [[ADD3]], 8
-; INDVARCHECK-NEXT: br i1 [[CMP3]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT:%.*]]
-; INDVARCHECK: while.end.loopexit:
-; INDVARCHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY]] ]
-; INDVARCHECK-NEXT: br label [[WHILE_END]]
-; INDVARCHECK: while.end:
-; INDVARCHECK-NEXT: [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[VAL]], [[WHILE_COND_PREHEADER]] ], [ [[ADD_LCSSA]], [[WHILE_END_LOOPEXIT]] ]
-; INDVARCHECK-NEXT: [[CONV:%.*]] = trunc i32 [[RESULT_0_LCSSA]] to i8
-; INDVARCHECK-NEXT: [[INC:%.*]] = add nsw i32 [[INX_1]], 1
-; INDVARCHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[INX_1]] to i64
-; INDVARCHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[IDXPROM]]
-; INDVARCHECK-NEXT: store i8 [[CONV]], i8* [[ARRAYIDX]], align 1
-; INDVARCHECK-NEXT: br label [[LOOP1_INC]]
-; INDVARCHECK: loop1.inc:
-; INDVARCHECK-NEXT: [[INX_2]] = phi i32 [ [[INC]], [[WHILE_END]] ], [ [[INX_1]], [[LOOP1_BODY]] ]
-; INDVARCHECK-NEXT: [[OUTER_I_1]] = add nuw nsw i32 [[OUTER_I]], 1
-; INDVARCHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[OUTER_I_1]], [[LIM]]
-; INDVARCHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT_LOOPEXIT:%.*]], label [[LOOP1_BODY]]
-; INDVARCHECK: exit.loopexit:
-; INDVARCHECK-NEXT: br label [[EXIT]]
-; INDVARCHECK: exit:
-; INDVARCHECK-NEXT: ret void
+; CHECK: ret void
;
entry:
%cmp1 = icmp sgt i32 %lim, 0
;
; Verify that the 'udiv' is hoisted to the preheader and is not in the loop body.
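; Here the divisor %counter1.0 starts at 1 and only ever increments (the outer
; loop runs while it is below 100), so '16 / %counter1.0' cannot trap and may
; be hoisted out of the inner loop.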
define i32 @NonZeroDivHoist(i32* nocapture readonly %ptr, i32 %start1, i32 %start2) {
-; CHECK-LABEL: @NonZeroDivHoist(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = sub i32 10, [[START2:%.*]]
-; CHECK-NEXT: br label [[FOR_COND:%.*]]
-; CHECK: for.cond:
-; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[START1:%.*]], [[ENTRY:%.*]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END:%.*]] ]
-; CHECK-NEXT: [[COUNTER1_0:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[INC9:%.*]], [[FOR_END]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[COUNTER1_0]], 100
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; CHECK-NEXT: [[CMP224:%.*]] = icmp ult i32 [[START2]], 10
-; CHECK-NEXT: br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
-; CHECK: for.body3.lr.ph:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP1:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[N_VEC]], [[TMP1]]
-; CHECK-NEXT: [[IND_END2:%.*]] = add i32 [[START2]], [[N_VEC]]
-; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP0]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[ADD]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], [[TMP3]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <2 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP6:%.*]] = mul i32 0, [[TMP3]]
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
-; CHECK: pred.load.if5:
-; CHECK-NEXT: [[TMP17:%.*]] = mul i32 1, [[TMP3]]
-; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], [[TMP18]]
-; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP20]] to i64
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP23]], i32 1
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]]
-; CHECK: pred.load.continue6:
-; CHECK-NEXT: [[TMP25:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT: [[TMP26:%.*]] = phi <2 x i32> [ [[TMP15]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT: [[TMP27]] = add <2 x i32> [[TMP26]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP28:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP27]], <2 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP28]])
-; CHECK-NEXT: br i1 true, label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[START2]], [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY3:%.*]]
-; CHECK: for.body3:
-; CHECK-NEXT: [[INDEX_027:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD4:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT: [[VAL_126:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT: [[COUNTER2_025:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT: [[DIV:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT: [[ADD4]] = add i32 [[DIV]], [[INDEX_027]]
-; CHECK-NEXT: [[IDXPROM5:%.*]] = zext i32 [[ADD4]] to i64
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; CHECK-NEXT: [[INC]] = add i32 [[COUNTER2_025]], 1
-; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[INC]], 10
-; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: for.cond1.for.end_crit_edge:
-; CHECK-NEXT: [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INC9]] = add i32 [[COUNTER1_0]], 1
-; CHECK-NEXT: br label [[FOR_COND]]
-; CHECK: for.end10:
-; CHECK-NEXT: [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; CHECK-NEXT: ret i32 [[VAL_0_LCSSA]]
-;
; INDVARCHECK-LABEL: @NonZeroDivHoist(
; INDVARCHECK-NEXT: entry:
-; INDVARCHECK-NEXT: br label [[FOR_COND:%.*]]
-; INDVARCHECK: for.cond:
-; INDVARCHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ 1, [[ENTRY:%.*]] ]
-; INDVARCHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[START1:%.*]], [[ENTRY]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END]] ]
-; INDVARCHECK-NEXT: [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV]], 100
-; INDVARCHECK-NEXT: br i1 [[EXITCOND4]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
-; INDVARCHECK: for.body:
-; INDVARCHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; INDVARCHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; INDVARCHECK-NEXT: [[CMP224:%.*]] = icmp ult i32 [[START2:%.*]], 10
-; INDVARCHECK-NEXT: br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
; INDVARCHECK: for.body3.lr.ph:
-; INDVARCHECK-NEXT: [[TMP0:%.*]] = udiv i64 16, [[INDVARS_IV]]
+; INDVARCHECK-NEXT: [[TMP0:%.*]] = udiv i64 16, [[INDVARS_IV:%.*]]
; INDVARCHECK-NEXT: br label [[FOR_BODY3:%.*]]
; INDVARCHECK: for.body3:
-; INDVARCHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT2:%.*]], [[FOR_BODY3]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; INDVARCHECK-NEXT: [[VAL_126:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT: [[COUNTER2_025:%.*]] = phi i32 [ [[START2]], [[FOR_BODY3_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT: [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], [[TMP0]]
-; INDVARCHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[INDVARS_IV_NEXT2]]
-; INDVARCHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; INDVARCHECK-NEXT: [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; INDVARCHECK-NEXT: [[INC]] = add nuw nsw i32 [[COUNTER2_025]], 1
-; INDVARCHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 10
-; INDVARCHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; INDVARCHECK: for.cond1.for.end_crit_edge:
-; INDVARCHECK-NEXT: [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT: br label [[FOR_END]]
-; INDVARCHECK: for.end:
-; INDVARCHECK-NEXT: [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; INDVARCHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; INDVARCHECK-NEXT: br label [[FOR_COND]]
+; INDVARCHECK-NOT: udiv
; INDVARCHECK: for.end10:
-; INDVARCHECK-NEXT: [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; INDVARCHECK-NEXT: ret i32 [[VAL_0_LCSSA]]
;
entry:
br label %for.cond
; Verify that the 'udiv' is not hoisted to the preheader and that it remains in
; the loop body.
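; Here the divisor %counter1.0 starts at %start1, which may be zero, so
; speculatively hoisting '16 / %counter1.0' out of the inner loop could
; introduce a division by zero.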
define i32 @ZeroDivNoHoist(i32* nocapture readonly %ptr, i32 %start1, i32 %start2) {
-; CHECK-LABEL: @ZeroDivNoHoist(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = sub i32 10, [[START2:%.*]]
-; CHECK-NEXT: br label [[FOR_COND:%.*]]
-; CHECK: for.cond:
-; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[START1:%.*]], [[ENTRY:%.*]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END:%.*]] ]
-; CHECK-NEXT: [[COUNTER1_0:%.*]] = phi i32 [ [[START1]], [[ENTRY]] ], [ [[INC9:%.*]], [[FOR_END]] ]
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[COUNTER1_0]], 100
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; CHECK-NEXT: [[CMP224:%.*]] = icmp ult i32 [[START2]], 10
-; CHECK-NEXT: br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
-; CHECK: for.body3.lr.ph:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP1:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[N_VEC]], [[TMP1]]
-; CHECK-NEXT: [[IND_END2:%.*]] = add i32 [[START2]], [[N_VEC]]
-; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP0]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[ADD]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP27:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], [[TMP3]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <2 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP6:%.*]] = mul i32 0, [[TMP3]]
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
-; CHECK: pred.load.if5:
-; CHECK-NEXT: [[TMP17:%.*]] = mul i32 1, [[TMP3]]
-; CHECK-NEXT: [[TMP18:%.*]] = add i32 [[OFFSET_IDX]], [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], [[TMP18]]
-; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP20]] to i64
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP21]]
-; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP22]], align 4
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP23]], i32 1
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]]
-; CHECK: pred.load.continue6:
-; CHECK-NEXT: [[TMP25:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP19]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT: [[TMP26:%.*]] = phi <2 x i32> [ [[TMP15]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT: [[TMP27]] = add <2 x i32> [[TMP26]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP28:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP27]], <2 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP28]])
-; CHECK-NEXT: br i1 true, label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[START2]], [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY3:%.*]]
-; CHECK: for.body3:
-; CHECK-NEXT: [[INDEX_027:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD4:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT: [[VAL_126:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT: [[COUNTER2_025:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT: [[DIV:%.*]] = udiv i32 16, [[COUNTER1_0]]
-; CHECK-NEXT: [[ADD4]] = add i32 [[DIV]], [[INDEX_027]]
-; CHECK-NEXT: [[IDXPROM5:%.*]] = zext i32 [[ADD4]] to i64
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; CHECK-NEXT: [[INC]] = add i32 [[COUNTER2_025]], 1
-; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[INC]], 10
-; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: for.cond1.for.end_crit_edge:
-; CHECK-NEXT: [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ], [ [[TMP30]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INC9]] = add i32 [[COUNTER1_0]], 1
-; CHECK-NEXT: br label [[FOR_COND]]
-; CHECK: for.end10:
-; CHECK-NEXT: [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; CHECK-NEXT: ret i32 [[VAL_0_LCSSA]]
-;
; INDVARCHECK-LABEL: @ZeroDivNoHoist(
; INDVARCHECK-NEXT: entry:
-; INDVARCHECK-NEXT: [[TMP0:%.*]] = zext i32 [[START1:%.*]] to i64
-; INDVARCHECK-NEXT: br label [[FOR_COND:%.*]]
-; INDVARCHECK: for.cond:
-; INDVARCHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
-; INDVARCHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[START1]], [[ENTRY]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END]] ]
-; INDVARCHECK-NEXT: [[INDVARS3:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; INDVARCHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INDVARS3]], 100
-; INDVARCHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
-; INDVARCHECK: for.body:
-; INDVARCHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; INDVARCHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; INDVARCHECK-NEXT: [[CMP224:%.*]] = icmp ult i32 [[START2:%.*]], 10
-; INDVARCHECK-NEXT: br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
-; INDVARCHECK: for.body3.lr.ph:
-; INDVARCHECK-NEXT: br label [[FOR_BODY3:%.*]]
+; INDVARCHECK-NOT: udiv
; INDVARCHECK: for.body3:
-; INDVARCHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT2:%.*]], [[FOR_BODY3]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; INDVARCHECK-NEXT: [[VAL_126:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT: [[COUNTER2_025:%.*]] = phi i32 [ [[START2]], [[FOR_BODY3_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT: [[TMP1:%.*]] = udiv i64 16, [[INDVARS_IV]]
-; INDVARCHECK-NEXT: [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], [[TMP1]]
-; INDVARCHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[INDVARS_IV_NEXT2]]
-; INDVARCHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; INDVARCHECK-NEXT: [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; INDVARCHECK-NEXT: [[INC]] = add nuw nsw i32 [[COUNTER2_025]], 1
-; INDVARCHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 10
-; INDVARCHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
+; INDVARCHECK: [[TMP1:%.*]] = udiv i64 16, [[INDVARS_IV:%.*]]
; INDVARCHECK: for.cond1.for.end_crit_edge:
-; INDVARCHECK-NEXT: [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT: br label [[FOR_END]]
-; INDVARCHECK: for.end:
-; INDVARCHECK-NEXT: [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; INDVARCHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; INDVARCHECK-NEXT: br label [[FOR_COND]]
-; INDVARCHECK: for.end10:
-; INDVARCHECK-NEXT: [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; INDVARCHECK-NEXT: ret i32 [[VAL_0_LCSSA]]
;
entry:
br label %for.cond
; Verify that the division-operation is hoisted, and that it appears as a
; right-shift ('lshr') rather than an explicit division.
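; (Dividing %counter1.0 by the nonzero constant 16 == 2^4 is always safe to
; hoist, and the unsigned division by a power of two is strength-reduced to
; 'lshr %counter1.0, 4'.)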
define i32 @DivBy16Hoist(i32* nocapture readonly %ptr, i32 %start1, i32 %start2) {
-; CHECK-LABEL: @DivBy16Hoist(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = sub i32 10, [[START2:%.*]]
-; CHECK-NEXT: br label [[FOR_COND:%.*]]
-; CHECK: for.cond:
-; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[START1:%.*]], [[ENTRY:%.*]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END:%.*]] ]
-; CHECK-NEXT: [[COUNTER1_0:%.*]] = phi i32 [ [[START1]], [[ENTRY]] ], [ [[INC9:%.*]], [[FOR_END]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[COUNTER1_0]], 4
-; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[COUNTER1_0]], 4
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[COUNTER1_0]], 100
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; CHECK-NEXT: [[CMP224:%.*]] = icmp ult i32 [[START2]], 10
-; CHECK-NEXT: br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
-; CHECK: for.body3.lr.ph:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[N_VEC]], [[TMP2]]
-; CHECK-NEXT: [[IND_END2:%.*]] = add i32 [[START2]], [[N_VEC]]
-; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP0]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[ADD]], i32 0
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], [[TMP1]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <2 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP6:%.*]] = mul i32 0, [[TMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[COUNTER1_0]], 16
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
-; CHECK: pred.load.if5:
-; CHECK-NEXT: [[TMP16:%.*]] = mul i32 1, [[TMP1]]
-; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = udiv i32 [[COUNTER1_0]], 16
-; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP22]], i32 1
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]]
-; CHECK: pred.load.continue6:
-; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT: [[TMP25]] = add <2 x i32> [[TMP24]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP26:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP25]], <2 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP26]])
-; CHECK-NEXT: br i1 true, label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[START2]], [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY3:%.*]]
-; CHECK: for.body3:
-; CHECK-NEXT: [[INDEX_027:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD4:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT: [[VAL_126:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT: [[COUNTER2_025:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[COUNTER1_0]], 16
-; CHECK-NEXT: [[ADD4]] = add i32 [[DIV]], [[INDEX_027]]
-; CHECK-NEXT: [[IDXPROM5:%.*]] = zext i32 [[ADD4]] to i64
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; CHECK-NEXT: [[INC]] = add i32 [[COUNTER2_025]], 1
-; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[INC]], 10
-; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: for.cond1.for.end_crit_edge:
-; CHECK-NEXT: [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INC9]] = add i32 [[COUNTER1_0]], 1
-; CHECK-NEXT: br label [[FOR_COND]]
-; CHECK: for.end10:
-; CHECK-NEXT: [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; CHECK-NEXT: ret i32 [[VAL_0_LCSSA]]
-;
; INDVARCHECK-LABEL: @DivBy16Hoist(
; INDVARCHECK-NEXT: entry:
-; INDVARCHECK-NEXT: [[TMP0:%.*]] = zext i32 [[START1:%.*]] to i64
-; INDVARCHECK-NEXT: br label [[FOR_COND:%.*]]
; INDVARCHECK: for.cond:
-; INDVARCHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
-; INDVARCHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[START1]], [[ENTRY]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END]] ]
-; INDVARCHECK-NEXT: [[INDVARS3:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; INDVARCHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[INDVARS_IV]], 4
-; INDVARCHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INDVARS3]], 100
-; INDVARCHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
+; INDVARCHECK: [[TMP1:%.*]] = lshr i64 [[INDVARS_IV:%.*]], 4
; INDVARCHECK: for.body:
-; INDVARCHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; INDVARCHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; INDVARCHECK-NEXT: [[CMP224:%.*]] = icmp ult i32 [[START2:%.*]], 10
-; INDVARCHECK-NEXT: br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
-; INDVARCHECK: for.body3.lr.ph:
-; INDVARCHECK-NEXT: br label [[FOR_BODY3:%.*]]
-; INDVARCHECK: for.body3:
-; INDVARCHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT2:%.*]], [[FOR_BODY3]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; INDVARCHECK-NEXT: [[VAL_126:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT: [[COUNTER2_025:%.*]] = phi i32 [ [[START2]], [[FOR_BODY3_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT: [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], [[TMP1]]
-; INDVARCHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[INDVARS_IV_NEXT2]]
-; INDVARCHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; INDVARCHECK-NEXT: [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; INDVARCHECK-NEXT: [[INC]] = add nuw nsw i32 [[COUNTER2_025]], 1
-; INDVARCHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 10
-; INDVARCHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; INDVARCHECK: for.cond1.for.end_crit_edge:
-; INDVARCHECK-NEXT: [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT: br label [[FOR_END]]
-; INDVARCHECK: for.end:
-; INDVARCHECK-NEXT: [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; INDVARCHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; INDVARCHECK-NEXT: br label [[FOR_COND]]
+; INDVARCHECK-NOT: lshr
+; INDVARCHECK-NOT: udiv
; INDVARCHECK: for.end10:
-; INDVARCHECK-NEXT: [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; INDVARCHECK-NEXT: ret i32 [[VAL_0_LCSSA]]
;
entry:
br label %for.cond
;
; Verify that the division operation is hoisted.
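; Illustrative sketch only (not matched by FileCheck); value names are taken
; from the check lines below:
;
;   before, inside the inner loop for.body3:
;     %div = udiv i32 %counter1.0, 17
;   after hoisting, in the outer-loop header for.cond:
;     %1 = udiv i64 %indvars.iv, 17
;
; Unlike the power-of-two case above, 17 cannot be strength-reduced to a shift,
; so the hoisted operation stays a 'udiv' and, for the INDVARCHECK run, the
; loop body must contain no 'udiv' afterwards (see the INDVARCHECK-NOT line
; below).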
define i32 @DivBy17Hoist(i32* nocapture readonly %ptr, i32 %start1, i32 %start2) {
-; CHECK-LABEL: @DivBy17Hoist(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = sub i32 10, [[START2:%.*]]
-; CHECK-NEXT: br label [[FOR_COND:%.*]]
-; CHECK: for.cond:
-; CHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[START1:%.*]], [[ENTRY:%.*]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END:%.*]] ]
-; CHECK-NEXT: [[COUNTER1_0:%.*]] = phi i32 [ [[START1]], [[ENTRY]] ], [ [[INC9:%.*]], [[FOR_END]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = udiv i32 [[COUNTER1_0]], 17
-; CHECK-NEXT: [[TMP2:%.*]] = udiv i32 [[COUNTER1_0]], 17
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[COUNTER1_0]], 100
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; CHECK-NEXT: [[CMP224:%.*]] = icmp ult i32 [[START2]], 10
-; CHECK-NEXT: br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
-; CHECK: for.body3.lr.ph:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP0]], 1
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = mul i32 [[N_VEC]], [[TMP2]]
-; CHECK-NEXT: [[IND_END2:%.*]] = add i32 [[START2]], [[N_VEC]]
-; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP0]], 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[ADD]], i32 0
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i32 [[INDEX]], [[TMP1]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT3]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT4]], <i32 0, i32 1>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ule <2 x i32> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP6:%.*]] = mul i32 0, [[TMP1]]
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[COUNTER1_0]], 17
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP7]]
-; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP12]], i32 0
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP13]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
-; CHECK: pred.load.if5:
-; CHECK-NEXT: [[TMP16:%.*]] = mul i32 1, [[TMP1]]
-; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[OFFSET_IDX]], [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = udiv i32 [[COUNTER1_0]], 17
-; CHECK-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], [[TMP17]]
-; CHECK-NEXT: [[TMP20:%.*]] = zext i32 [[TMP19]] to i64
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[TMP20]]
-; CHECK-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4
-; CHECK-NEXT: [[TMP23:%.*]] = insertelement <2 x i32> [[TMP14]], i32 [[TMP22]], i32 1
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]]
-; CHECK: pred.load.continue6:
-; CHECK-NEXT: [[TMP24:%.*]] = phi <2 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT: [[TMP25]] = add <2 x i32> [[TMP24]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP26:%.*]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP25]], <2 x i32> [[VEC_PHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[TMP26]])
-; CHECK-NEXT: br i1 true, label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[START2]], [[FOR_BODY3_LR_PH]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_BODY3:%.*]]
-; CHECK: for.body3:
-; CHECK-NEXT: [[INDEX_027:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD4:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT: [[VAL_126:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT: [[COUNTER2_025:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[COUNTER1_0]], 17
-; CHECK-NEXT: [[ADD4]] = add i32 [[DIV]], [[INDEX_027]]
-; CHECK-NEXT: [[IDXPROM5:%.*]] = zext i32 [[ADD4]] to i64
-; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[IDXPROM5]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; CHECK-NEXT: [[INC]] = add i32 [[COUNTER2_025]], 1
-; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i32 [[INC]], 10
-; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: for.cond1.for.end_crit_edge:
-; CHECK-NEXT: [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ], [ [[TMP28]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[INC9]] = add i32 [[COUNTER1_0]], 1
-; CHECK-NEXT: br label [[FOR_COND]]
-; CHECK: for.end10:
-; CHECK-NEXT: [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; CHECK-NEXT: ret i32 [[VAL_0_LCSSA]]
-;
; INDVARCHECK-LABEL: @DivBy17Hoist(
; INDVARCHECK-NEXT: entry:
-; INDVARCHECK-NEXT: [[TMP0:%.*]] = zext i32 [[START1:%.*]] to i64
-; INDVARCHECK-NEXT: br label [[FOR_COND:%.*]]
; INDVARCHECK: for.cond:
-; INDVARCHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ], [ [[TMP0]], [[ENTRY:%.*]] ]
-; INDVARCHECK-NEXT: [[VAL_0:%.*]] = phi i32 [ [[START1]], [[ENTRY]] ], [ [[VAL_1_LCSSA:%.*]], [[FOR_END]] ]
-; INDVARCHECK-NEXT: [[INDVARS3:%.*]] = trunc i64 [[INDVARS_IV]] to i32
-; INDVARCHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[INDVARS_IV]], 17
-; INDVARCHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[INDVARS3]], 100
-; INDVARCHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END10:%.*]]
+; INDVARCHECK: [[TMP1:%.*]] = udiv i64 [[INDVARS_IV:%.*]], 17
; INDVARCHECK: for.body:
-; INDVARCHECK-NEXT: [[TMP:%.*]] = load i32, i32* [[PTR:%.*]], align 4
-; INDVARCHECK-NEXT: [[ADD:%.*]] = add i32 [[TMP]], [[VAL_0]]
-; INDVARCHECK-NEXT: [[CMP224:%.*]] = icmp ult i32 [[START2:%.*]], 10
-; INDVARCHECK-NEXT: br i1 [[CMP224]], label [[FOR_BODY3_LR_PH:%.*]], label [[FOR_END]]
-; INDVARCHECK: for.body3.lr.ph:
-; INDVARCHECK-NEXT: br label [[FOR_BODY3:%.*]]
-; INDVARCHECK: for.body3:
-; INDVARCHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT2:%.*]], [[FOR_BODY3]] ], [ 0, [[FOR_BODY3_LR_PH]] ]
-; INDVARCHECK-NEXT: [[VAL_126:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY3_LR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT: [[COUNTER2_025:%.*]] = phi i32 [ [[START2]], [[FOR_BODY3_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT: [[INDVARS_IV_NEXT2]] = add nuw nsw i64 [[INDVARS_IV1]], [[TMP1]]
-; INDVARCHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[PTR]], i64 [[INDVARS_IV_NEXT2]]
-; INDVARCHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX6]], align 4
-; INDVARCHECK-NEXT: [[ADD7]] = add i32 [[TMP1]], [[VAL_126]]
-; INDVARCHECK-NEXT: [[INC]] = add nuw nsw i32 [[COUNTER2_025]], 1
-; INDVARCHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INC]], 10
-; INDVARCHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY3]], label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]]
-; INDVARCHECK: for.cond1.for.end_crit_edge:
-; INDVARCHECK-NEXT: [[SPLIT:%.*]] = phi i32 [ [[ADD7]], [[FOR_BODY3]] ]
-; INDVARCHECK-NEXT: br label [[FOR_END]]
-; INDVARCHECK: for.end:
-; INDVARCHECK-NEXT: [[VAL_1_LCSSA]] = phi i32 [ [[SPLIT]], [[FOR_COND1_FOR_END_CRIT_EDGE]] ], [ [[ADD]], [[FOR_BODY]] ]
-; INDVARCHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; INDVARCHECK-NEXT: br label [[FOR_COND]]
+; INDVARCHECK-NOT: udiv
; INDVARCHECK: for.end10:
-; INDVARCHECK-NEXT: [[VAL_0_LCSSA:%.*]] = phi i32 [ [[VAL_0]], [[FOR_COND]] ]
-; INDVARCHECK-NEXT: ret i32 [[VAL_0_LCSSA]]
;
entry:
br label %for.cond
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=2 -pass-remarks-missed='loop-vectorize' -S < %s 2>&1 | FileCheck %s
; CHECK: remark: <unknown>:0:0: loop not vectorized: integer loop induction variable could not be identified
;}
define void @foo(float* nocapture %ptr, float %val) local_unnamed_addr {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[PTR_PROMOTED:%.*]] = load float, float* [[PTR:%.*]], align 4
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[ADD5:%.*]] = phi float [ [[PTR_PROMOTED]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[F_04:%.*]] = phi float [ 0x3FB99999A0000000, [[ENTRY]] ], [ [[ADD1:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ADD]] = fadd fast float [[ADD5]], [[VAL:%.*]]
-; CHECK-NEXT: [[ADD1]] = fadd fast float [[F_04]], 0x3F847AE140000000
-; CHECK-NEXT: [[CMP:%.*]] = fcmp fast olt float [[ADD1]], 1.000000e+00
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK: for.end:
-; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], [[FOR_BODY]] ]
-; CHECK-NEXT: store float [[ADD_LCSSA]], float* [[PTR]], align 4
-; CHECK-NEXT: ret void
-;
entry:
%ptr.promoted = load float, float* %ptr, align 4
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt -S -loop-vectorize -force-vector-width=8 -force-vector-interleave=1 -enable-interleaved-mem-accesses -debug-only=loop-vectorize,vectorutils -disable-output < %s 2>&1 | FileCheck %s
target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
; Ensure that we don't create interleave groups for predicated
-; strided accesses.
+; strided accesses.
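; Rough sketch of the pattern in question (hypothetical names, for
; illustration only):
;
;   for.body:
;     br i1 %cond, label %if.then, label %for.inc
;   if.then:                                           ; predicated block
;     %idx = shl nuw nsw i32 %i, 1                     ; stride-2 index
;     %gep = getelementptr inbounds i16, i16* %p, i32 %idx
;     %v = load i16, i16* %gep, align 2
;
; Widening such a conditionally executed strided access as part of an
; interleave group could touch lanes whose predicate is false, so the analysis
; is expected to leave these accesses ungrouped.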
; CHECK: LV: Checking a loop in 'masked_strided'
; CHECK: LV: Analyzing interleaved accesses...
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SREM_CONTINUE2:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = load i16, i16* @v_38, align 1
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = icmp eq <2 x i16> [[BROADCAST_SPLAT]], zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
-; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_SREM_IF:%.*]], label [[PRED_SREM_CONTINUE:%.*]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SREM_CONTINUE4:%.*]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = load i16, i16* @v_38, align 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i16> poison, i16 [[TMP2]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT1]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <2 x i16> [[BROADCAST_SPLAT2]], zeroinitializer
+; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_SREM_IF:%.*]], label [[PRED_SREM_CONTINUE:%.*]]
; CHECK: pred.srem.if:
-; CHECK-NEXT: [[TMP4:%.*]] = srem i16 5786, [[TMP0]]
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i16> poison, i16 [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = srem i16 5786, [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i16> poison, i16 [[TMP7]], i32 0
; CHECK-NEXT: br label [[PRED_SREM_CONTINUE]]
; CHECK: pred.srem.continue:
-; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_SREM_IF]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
-; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_SREM_IF1:%.*]], label [[PRED_SREM_CONTINUE2]]
+; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_SREM_IF]] ]
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_SREM_IF3:%.*]], label [[PRED_SREM_CONTINUE4]]
; CHECK: pred.srem.if1:
-; CHECK-NEXT: [[TMP8:%.*]] = srem i16 5786, [[TMP0]]
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i16> [[TMP6]], i16 [[TMP8]], i32 1
-; CHECK-NEXT: br label [[PRED_SREM_CONTINUE2]]
+; CHECK-NEXT: [[TMP11:%.*]] = srem i16 5786, [[TMP2]]
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i16> [[TMP9]], i16 [[TMP11]], i32 1
+; CHECK-NEXT: br label [[PRED_SREM_CONTINUE4]]
; CHECK: pred.srem.continue2:
-; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ [[TMP6]], [[PRED_SREM_CONTINUE]] ], [ [[TMP9]], [[PRED_SREM_IF1]] ]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> <i16 5786, i16 5786>, <2 x i16> [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
-; CHECK-NEXT: store i16 [[TMP11]], i16* @v_39, align 1
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
-; CHECK-NEXT: store i16 [[TMP12]], i16* @v_39, align 1
+; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP9]], [[PRED_SREM_CONTINUE]] ], [ [[TMP12]], [[PRED_SREM_IF3]] ]
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP4]], <2 x i16> <i16 5786, i16 5786>, <2 x i16> [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
+; CHECK-NEXT: store i16 [[TMP14]], i16* @v_39, align 1
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
+; CHECK-NEXT: store i16 [[TMP15]], i16* @v_39, align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 12, 12
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: store i16 [[COND6]], i16* @v_39, align 1
; CHECK-NEXT: [[INC7]] = add nsw i16 [[I_07]], 1
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i16 [[INC7]], 111
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], [[LOOP2:!llvm.loop !.*]]
; CHECK: exit:
; CHECK-NEXT: [[RV:%.*]] = load i16, i16* @v_39, align 1
; CHECK-NEXT: ret i16 [[RV]]
; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], [[ARR2]]
; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i8
; CHECK-NEXT: [[TMP6:%.*]] = add i8 1, [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = icmp slt i8 [[TMP6]], 1
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[TMP4]], 255
-; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp slt i8 [[TMP6]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP4]], 255
+; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[TMP9]], [[TMP11]]
+; CHECK-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 4
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ <i8 0, i8 1, i8 2, i8 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i8> [[VEC_IND]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i8> [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[ARR]], i8 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp slt <4 x i8> [[TMP10]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP14:%.*]] = zext <4 x i1> [[TMP13]] to <4 x i8>
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[TMP14]], <4 x i8>* [[TMP16]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = add <4 x i8> [[VEC_IND]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i8> [[TMP14]], i32 0
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, i8* [[ARR]], i8 [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp slt <4 x i8> [[TMP14]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP18:%.*]] = zext <4 x i1> [[TMP17]] to <4 x i8>
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, i8* [[TMP16]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8* [[TMP19]] to <4 x i8>*
+; CHECK-NEXT: store <4 x i8> [[TMP18]], <4 x i8>* [[TMP20]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], <i8 4, i8 4, i8 4, i8 4>
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-width=4 -S | FileCheck %s
; Test case for PR45525. Checks that phis with a single predecessor and a mask are supported.
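; Sketch of the shape being tested (names follow the scalar loop reproduced in
; the original check lines; illustrative only):
;
;   bb.1:
;     %iv = phi i32 [ 0, %bb.0 ], [ %iv.next, %bb.3 ]
;     br i1 %cond, label %bb.3, label %bb.2
;   bb.2:                                     ; runs only when %cond is false
;     %single_pred = phi i32 [ %iv, %bb.1 ]   ; phi with a single predecessor
;     %mult = mul i32 %single_pred, 3
;     br label %bb.3
;   bb.3:
;     %stored_value = phi i32 [ 7, %bb.1 ], [ %mult, %bb.2 ]
;
; The single-entry phi simply forwards %iv, so when bb.2 is executed under a
; mask the two-way phi in bb.3 can be vectorized as a select.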
; CHECK-NEXT: bb.0:
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[COND:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP1]], <4 x i32> <i32 7, i32 7, i32 7, i32 7>
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[PREDPHI]], <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 32, 32
-; CHECK-NEXT: br i1 [[CMP_N]], label [[BB_4:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[BB_0:%.*]] ]
-; CHECK-NEXT: br label [[BB_1:%.*]]
-; CHECK: bb.1:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[BB_3:%.*]] ]
-; CHECK-NEXT: br i1 [[COND]], label [[BB_3]], label [[BB_2:%.*]]
-; CHECK: bb.2:
-; CHECK-NEXT: [[SINGLE_PRED:%.*]] = phi i32 [ [[IV]], [[BB_1]] ]
-; CHECK-NEXT: [[MULT:%.*]] = mul i32 [[SINGLE_PRED]], 3
-; CHECK-NEXT: br label [[BB_3]]
-; CHECK: bb.3:
-; CHECK-NEXT: [[STORED_VALUE:%.*]] = phi i32 [ 7, [[BB_1]] ], [ [[MULT]], [[BB_2]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i32 [[IV]]
-; CHECK-NEXT: store i32 [[STORED_VALUE]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[CONTINUE:%.*]] = icmp ult i32 [[IV_NEXT]], 32
-; CHECK-NEXT: br i1 [[CONTINUE]], label [[BB_1]], label [[BB_4]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: bb.4:
-; CHECK-NEXT: ret void
+; CHECK: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK: [[TMP5:%.*]] = mul <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
;
bb.0:
br label %bb.1
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1
; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop !2
; CHECK: exit:
; CHECK-NEXT: ret void
;
; VF2UF2-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; VF2UF2-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], <i32 2, i32 2>
; VF2UF2-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; VF2UF2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF2UF2-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
; VF2UF2: middle.block:
; VF2UF2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; VF2UF2: scalar.ph:
; VF2UF2-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1
; VF2UF2-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
; VF2UF2-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14
-; VF2UF2-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF2UF2-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop !2
; VF2UF2: exit:
; VF2UF2-NEXT: ret void
;
; VF1UF4: vector.ph:
; VF1UF4-NEXT: br label [[VECTOR_BODY:%.*]]
; VF1UF4: vector.body:
-; VF1UF4-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
-; VF1UF4-NEXT: [[VEC_IV:%.*]] = add i32 [[INDEX]], 0
-; VF1UF4-NEXT: [[VEC_IV4:%.*]] = add i32 [[INDEX]], 1
-; VF1UF4-NEXT: [[VEC_IV5:%.*]] = add i32 [[INDEX]], 2
-; VF1UF4-NEXT: [[VEC_IV6:%.*]] = add i32 [[INDEX]], 3
-; VF1UF4-NEXT: [[TMP0:%.*]] = icmp ule i32 [[VEC_IV]], 13
-; VF1UF4-NEXT: [[TMP1:%.*]] = icmp ule i32 [[VEC_IV4]], 13
-; VF1UF4-NEXT: [[TMP2:%.*]] = icmp ule i32 [[VEC_IV5]], 13
-; VF1UF4-NEXT: [[TMP3:%.*]] = icmp ule i32 [[VEC_IV6]], 13
+; VF1UF4-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
+; VF1UF4-NEXT: [[INDUCTION:%.*]] = add i32 [[INDEX]], 0
+; VF1UF4-NEXT: [[INDUCTION1:%.*]] = add i32 [[INDEX]], 1
+; VF1UF4-NEXT: [[INDUCTION2:%.*]] = add i32 [[INDEX]], 2
+; VF1UF4-NEXT: [[INDUCTION3:%.*]] = add i32 [[INDEX]], 3
+; VF1UF4-NEXT: [[TMP0:%.*]] = icmp ule i32 [[INDUCTION]], 13
+; VF1UF4-NEXT: [[TMP1:%.*]] = icmp ule i32 [[INDUCTION1]], 13
+; VF1UF4-NEXT: [[TMP2:%.*]] = icmp ule i32 [[INDUCTION2]], 13
+; VF1UF4-NEXT: [[TMP3:%.*]] = icmp ule i32 [[INDUCTION3]], 13
; VF1UF4-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; VF1UF4: pred.store.if:
-; VF1UF4-NEXT: [[INDUCTION:%.*]] = add i32 [[INDEX]], 0
-; VF1UF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDUCTION]]
+; VF1UF4-NEXT: [[SUNK_IND0:%.*]] = add i32 [[INDEX]], 0
+; VF1UF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[SUNK_IND0]]
; VF1UF4-NEXT: store i32 13, i32* [[TMP4]], align 1
; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE]]
; VF1UF4: pred.store.continue:
-; VF1UF4-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; VF1UF4-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
; VF1UF4: pred.store.if7:
-; VF1UF4-NEXT: [[INDUCTION1:%.*]] = add i32 [[INDEX]], 1
-; VF1UF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDUCTION1]]
+; VF1UF4-NEXT: [[SUNK_IND1:%.*]] = add i32 [[INDEX]], 1
+; VF1UF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[SUNK_IND1]]
; VF1UF4-NEXT: store i32 13, i32* [[TMP5]], align 1
-; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE5]]
; VF1UF4: pred.store.continue8:
-; VF1UF4-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; VF1UF4-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
; VF1UF4: pred.store.if9:
-; VF1UF4-NEXT: [[INDUCTION2:%.*]] = add i32 [[INDEX]], 2
-; VF1UF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDUCTION2]]
+; VF1UF4-NEXT: [[SUNK_IND2:%.*]] = add i32 [[INDEX]], 2
+; VF1UF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[SUNK_IND2]]
; VF1UF4-NEXT: store i32 13, i32* [[TMP6]], align 1
-; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE10]]
+; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE7]]
; VF1UF4: pred.store.continue10:
-; VF1UF4-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
+; VF1UF4-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
; VF1UF4: pred.store.if11:
-; VF1UF4-NEXT: [[INDUCTION3:%.*]] = add i32 [[INDEX]], 3
-; VF1UF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[INDUCTION3]]
+; VF1UF4-NEXT: [[SUNK_IND3:%.*]] = add i32 [[INDEX]], 3
+; VF1UF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[SUNK_IND3]]
; VF1UF4-NEXT: store i32 13, i32* [[TMP7]], align 1
-; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE12]]
+; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE9]]
; VF1UF4: pred.store.continue12:
; VF1UF4-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; VF1UF4-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16
-; VF1UF4-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VF1UF4-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
; VF1UF4: middle.block:
; VF1UF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
; VF1UF4: scalar.ph:
; VF1UF4-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1
; VF1UF4-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
; VF1UF4-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14
-; VF1UF4-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; VF1UF4-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]]
; VF1UF4: exit:
; VF1UF4-NEXT: ret void
;
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], [[INC]]
-; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[OFFSET_IDX]] to i8
-; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INC]] to i8
-; CHECK-NEXT: [[TMP5:%.*]] = mul i8 0, [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = add i8 [[TMP3]], [[TMP5]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
-; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
+; CHECK-NEXT: [[OFFSET_IDX3:%.*]] = mul i64 [[INDEX]], [[INC]]
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i64 [[OFFSET_IDX3]] to i8
+; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[INC]] to i8
+; CHECK-NEXT: [[TMP9:%.*]] = mul i8 0, [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i8 [[TMP6]], [[TMP9]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT9]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT10]], <i64 0, i64 1>
+; CHECK-NEXT: [[TMP11:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0
+; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
; CHECK-NEXT: store i32 0, i32* [[PTR:%.*]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
; CHECK: pred.store.continue:
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
-; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1
+; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
; CHECK: pred.store.if3:
; CHECK-NEXT: store i32 0, i32* [[PTR]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]]
; CHECK: pred.store.continue4:
-; CHECK-NEXT: [[TMP10:%.*]] = add i8 [[TMP6]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = add i8 [[TMP10]], 1
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: store i8 10, i8* [[TMP0]], align 1
; CHECK-NEXT: store i8 10, i8* [[TMP0]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[INDEX_NEXT]], 500
-; CHECK-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 500
+; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 500, 500
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE2]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i16> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE4]] ]
; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i16 41, [[TMP0]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i32> poison, i32 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT1]], <2 x i32> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i32> [[BROADCAST_SPLAT2]], <i32 0, i32 1>
; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <2 x i32> [[VEC_IV]], <i32 40, i32 40>
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_LOAD_IF]] ]
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
; CHECK: pred.load.if1:
; CHECK-NEXT: [[TMP14:%.*]] = add i16 [[OFFSET_IDX]], -1
; CHECK-NEXT: [[TMP15:%.*]] = add nsw i16 [[TMP14]], -1
; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [40 x [4 x i16]], [40 x [4 x i16]]* @A, i16 0, i16 [[TMP15]], i16 3
; CHECK-NEXT: [[TMP20:%.*]] = load i16, i16* [[TMP19]], align 1
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i16> [[TMP12]], i16 [[TMP20]], i32 1
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]]
+; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]]
; CHECK: pred.load.continue2:
-; CHECK-NEXT: [[TMP22:%.*]] = phi <2 x i16> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP18]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i16> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], [[PRED_LOAD_IF1]] ]
+; CHECK-NEXT: [[TMP22:%.*]] = phi <2 x i16> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP18]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = phi <2 x i16> [ [[TMP12]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP21]], [[PRED_LOAD_IF3]] ]
; CHECK-NEXT: [[TMP24:%.*]] = add nsw <2 x i16> [[TMP22]], [[TMP23]]
; CHECK-NEXT: [[TMP25]] = add <2 x i16> [[VEC_PHI]], [[TMP24]]
; CHECK-NEXT: [[TMP26:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[TMP25]], <2 x i16> [[VEC_PHI]]
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -prefer-inloop-reductions -dce -instcombine -S | FileCheck %s
define float @cond_fadd(float* noalias nocapture readonly %a, float* noalias nocapture readonly %cond, i64 %N){
; CHECK-NEXT: [[TMP51]] = add i32 [[TMP50]], [[TMP28]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP52]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[RES]] = phi i32 [ [[ADD3]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: ret i32 [[RES_LCSSA]]
;
define i64 @nested_cond_and(i64* noalias nocapture readonly %a, i64* noalias nocapture readonly %b, i64* noalias nocapture readonly %cond, i64 %N){
; CHECK-LABEL: @nested_cond_and(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ <i64 5, i64 -1, i64 -1, i64 -1>, [[VECTOR_PH]] ], [ [[PREDPHI15:%.*]], [[PRED_LOAD_CONTINUE14]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, i64* [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[TMP3]] to <4 x i64>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, <4 x i64>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0
-; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, i64* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i64, i64* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i64> poison, i64 [[TMP8]], i64 0
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1
-; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK: pred.load.if1:
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i64> [[TMP10]], i64 [[TMP13]], i64 1
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]]
-; CHECK: pred.load.continue2:
-; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i64> [ [[TMP10]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2
-; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK: pred.load.if3:
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP18:%.*]] = load i64, i64* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i64> [[TMP15]], i64 [[TMP18]], i64 2
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]]
-; CHECK: pred.load.continue4:
-; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i64> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3
-; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK: pred.load.if5:
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP23:%.*]] = load i64, i64* [[TMP22]], align 4
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i64> [[TMP20]], i64 [[TMP23]], i64 3
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]]
-; CHECK: pred.load.continue6:
-; CHECK-NEXT: [[TMP25:%.*]] = phi <4 x i64> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq <4 x i64> [[TMP25]], <i64 3, i64 3, i64 3, i64 3>
-; CHECK-NEXT: [[TMP27:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> [[TMP26]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP27]], i64 0
-; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
-; CHECK: pred.load.if7:
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, i64* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP30:%.*]] = load i64, i64* [[TMP29]], align 4
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x i64> poison, i64 [[TMP30]], i64 0
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]]
-; CHECK: pred.load.continue8:
-; CHECK-NEXT: [[TMP32:%.*]] = phi <4 x i64> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP31]], [[PRED_LOAD_IF7]] ]
-; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i1> [[TMP27]], i64 1
-; CHECK-NEXT: br i1 [[TMP33]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
-; CHECK: pred.load.if9:
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP35:%.*]] = load i64, i64* [[TMP34]], align 4
-; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x i64> [[TMP32]], i64 [[TMP35]], i64 1
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]]
-; CHECK: pred.load.continue10:
-; CHECK-NEXT: [[TMP37:%.*]] = phi <4 x i64> [ [[TMP32]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP36]], [[PRED_LOAD_IF9]] ]
-; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP27]], i64 2
-; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
-; CHECK: pred.load.if11:
-; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP40:%.*]] = load i64, i64* [[TMP39]], align 4
-; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i64> [[TMP37]], i64 [[TMP40]], i64 2
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]]
-; CHECK: pred.load.continue12:
-; CHECK-NEXT: [[TMP42:%.*]] = phi <4 x i64> [ [[TMP37]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP41]], [[PRED_LOAD_IF11]] ]
-; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i1> [[TMP27]], i64 3
-; CHECK-NEXT: br i1 [[TMP43]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]]
-; CHECK: pred.load.if13:
-; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP45:%.*]] = load i64, i64* [[TMP44]], align 4
-; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x i64> [[TMP42]], i64 [[TMP45]], i64 3
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]]
-; CHECK: pred.load.continue14:
-; CHECK-NEXT: [[TMP47:%.*]] = phi <4 x i64> [ [[TMP42]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP46]], [[PRED_LOAD_IF13]] ]
-; CHECK-NEXT: [[TMP48:%.*]] = xor <4 x i1> [[TMP26]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP49:%.*]] = select <4 x i1> [[TMP5]], <4 x i1> [[TMP48]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[PREDPHI_V:%.*]] = select <4 x i1> [[TMP49]], <4 x i64> [[TMP25]], <4 x i64> [[TMP47]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> [[PREDPHI_V]], <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1>
-; CHECK-NEXT: [[PREDPHI15]] = and <4 x i64> [[VEC_PHI]], [[PREDPHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP50:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP50]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NOT: @llvm.vector.reduce.and
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP51:%.*]] = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> [[PREDPHI15]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP51]], [[MIDDLE_BLOCK]] ], [ 5, [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_INC:%.*]] ]
-; CHECK-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RES:%.*]], [[FOR_INC]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[COND]], i64 [[IV]]
-; CHECK-NEXT: [[TMP52:%.*]] = load i64, i64* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i64 [[TMP52]], 0
-; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
-; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[TMP53:%.*]] = load i64, i64* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[AND1:%.*]] = and i64 [[RDX]], [[TMP53]]
-; CHECK-NEXT: [[TOBOOL2:%.*]] = icmp eq i64 [[TMP53]], 3
-; CHECK-NEXT: br i1 [[TOBOOL2]], label [[IF_THEN_2:%.*]], label [[FOR_INC]]
-; CHECK: if.then.2:
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[IV]]
-; CHECK-NEXT: [[TMP54:%.*]] = load i64, i64* [[ARRAYIDX3]], align 4
-; CHECK-NEXT: [[AND2:%.*]] = and i64 [[RDX]], [[TMP54]]
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[RES]] = phi i64 [ [[AND2]], [[IF_THEN_2]] ], [ [[AND1]], [[IF_THEN]] ], [ [[RDX]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i64 [ [[RES]], [[FOR_INC]] ], [ [[TMP51]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i64 [[RES_LCSSA]]
-;
+; CHECK: @llvm.vector.reduce.and
+; CHECK: scalar.ph
entry:
br label %for.body
;
define i32 @cond-uncond(i32* noalias %src1, i32* noalias %src2, i32* noalias %cond, i64 noundef %n) #0 {
; CHECK-LABEL: @cond-uncond(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[PRED_LOAD_CONTINUE6]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[TMP2]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0
-; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i64 0
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1
-; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK: pred.load.if1:
-; CHECK-NEXT: [[TMP10:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP11]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP12]], i64 1
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]]
-; CHECK: pred.load.continue2:
-; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2
-; CHECK-NEXT: br i1 [[TMP15]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK: pred.load.if3:
-; CHECK-NEXT: [[TMP16:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP16]]
-; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP18]], i64 2
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]]
-; CHECK: pred.load.continue4:
-; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x i32> [ [[TMP14]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3
-; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]]
-; CHECK: pred.load.if5:
-; CHECK-NEXT: [[TMP22:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = load i32, i32* [[TMP23]], align 4
-; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[TMP24]], i64 3
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]]
; CHECK: pred.load.continue6:
-; CHECK-NEXT: [[TMP26:%.*]] = phi <4 x i32> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP25]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT: [[TMP27:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> zeroinitializer, <4 x i32> [[TMP26]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = add <4 x i32> [[VEC_PHI]], [[TMP27]]
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP29]], align 4
-; CHECK-NEXT: [[TMP30]] = add <4 x i32> [[WIDE_LOAD7]], [[PREDPHI]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NOT: @llvm.vector.reduce.add
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP30]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP32]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[RDX1:%.*]] = phi i32 [ [[ADD2:%.*]], [[IF_END:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[IF_END]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]]
-; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP33]], 0
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[IV]]
-; CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP34]], [[RDX1]]
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ [[RDX1]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[IV]]
-; CHECK-NEXT: [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD2]] = add nsw i32 [[TMP35]], [[RES]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i32 [ [[ADD2]], [[IF_END]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[ADD2_LCSSA]]
-;
+; CHECK: @llvm.vector.reduce.add
+; CHECK: scalar.ph
entry:
br label %for.body
;
define float @cond_cond(float* noalias %src1, float* noalias %src2, float* noalias %cond, i64 %n) #0 {
; CHECK-LABEL: @cond_cond(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE14:%.*]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 2.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, [[VECTOR_PH]] ], [ [[PREDPHI15:%.*]], [[PRED_LOAD_CONTINUE14]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, float* [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[TMP3]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i64 0
-; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[SRC1:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[TMP7]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP8]], i64 0
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x float> [ poison, [[VECTOR_BODY]] ], [ [[TMP9]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i1> [[TMP5]], i64 1
-; CHECK-NEXT: br i1 [[TMP11]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]]
-; CHECK: pred.load.if1:
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[TMP12]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP13]], i64 1
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]]
-; CHECK: pred.load.continue2:
-; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x float> [ [[TMP10]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ]
-; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i1> [[TMP5]], i64 2
-; CHECK-NEXT: br i1 [[TMP16]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK: pred.load.if3:
-; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP18:%.*]] = load float, float* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP18]], i64 2
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]]
-; CHECK: pred.load.continue4:
-; CHECK-NEXT: [[TMP20:%.*]] = phi <4 x float> [ [[TMP15]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP19]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP5]], i64 3
-; CHECK-NEXT: br i1 [[TMP21]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]]
-; CHECK: pred.load.if5:
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP23:%.*]] = load float, float* [[TMP22]], align 4
-; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP23]], i64 3
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]]
-; CHECK: pred.load.continue6:
-; CHECK-NEXT: [[TMP25:%.*]] = phi <4 x float> [ [[TMP20]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ]
-; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> [[TMP5]], <4 x float> [[TMP25]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>
-; CHECK-NEXT: [[PREDPHI:%.*]] = fadd fast <4 x float> [[VEC_PHI]], [[TMP26]]
-; CHECK-NEXT: [[TMP27:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], <float 7.000000e+00, float 7.000000e+00, float 7.000000e+00, float 7.000000e+00>
-; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i1> [[TMP27]], i64 0
-; CHECK-NEXT: br i1 [[TMP28]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8:%.*]]
-; CHECK: pred.load.if7:
-; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[SRC2:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP30:%.*]] = load float, float* [[TMP29]], align 4
-; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x float> poison, float [[TMP30]], i64 0
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]]
-; CHECK: pred.load.continue8:
-; CHECK-NEXT: [[TMP32:%.*]] = phi <4 x float> [ poison, [[PRED_LOAD_CONTINUE6]] ], [ [[TMP31]], [[PRED_LOAD_IF7]] ]
-; CHECK-NEXT: [[TMP33:%.*]] = extractelement <4 x i1> [[TMP27]], i64 1
-; CHECK-NEXT: br i1 [[TMP33]], label [[PRED_LOAD_IF9:%.*]], label [[PRED_LOAD_CONTINUE10:%.*]]
-; CHECK: pred.load.if9:
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP35:%.*]] = load float, float* [[TMP34]], align 4
-; CHECK-NEXT: [[TMP36:%.*]] = insertelement <4 x float> [[TMP32]], float [[TMP35]], i64 1
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE10]]
-; CHECK: pred.load.continue10:
-; CHECK-NEXT: [[TMP37:%.*]] = phi <4 x float> [ [[TMP32]], [[PRED_LOAD_CONTINUE8]] ], [ [[TMP36]], [[PRED_LOAD_IF9]] ]
-; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i1> [[TMP27]], i64 2
-; CHECK-NEXT: br i1 [[TMP38]], label [[PRED_LOAD_IF11:%.*]], label [[PRED_LOAD_CONTINUE12:%.*]]
-; CHECK: pred.load.if11:
-; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP40:%.*]] = load float, float* [[TMP39]], align 4
-; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP37]], float [[TMP40]], i64 2
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE12]]
-; CHECK: pred.load.continue12:
-; CHECK-NEXT: [[TMP42:%.*]] = phi <4 x float> [ [[TMP37]], [[PRED_LOAD_CONTINUE10]] ], [ [[TMP41]], [[PRED_LOAD_IF11]] ]
-; CHECK-NEXT: [[TMP43:%.*]] = extractelement <4 x i1> [[TMP27]], i64 3
-; CHECK-NEXT: br i1 [[TMP43]], label [[PRED_LOAD_IF13:%.*]], label [[PRED_LOAD_CONTINUE14]]
-; CHECK: pred.load.if13:
-; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[TMP2]]
-; CHECK-NEXT: [[TMP45:%.*]] = load float, float* [[TMP44]], align 4
-; CHECK-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP45]], i64 3
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE14]]
; CHECK: pred.load.continue14:
-; CHECK-NEXT: [[TMP47:%.*]] = phi <4 x float> [ [[TMP42]], [[PRED_LOAD_CONTINUE12]] ], [ [[TMP46]], [[PRED_LOAD_IF13]] ]
-; CHECK-NEXT: [[TMP48:%.*]] = select <4 x i1> [[TMP27]], <4 x float> [[TMP47]], <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>
-; CHECK-NEXT: [[PREDPHI15]] = fadd fast <4 x float> [[PREDPHI]], [[TMP48]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NOT: @llvm.vector.reduce.fadd
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP50:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PREDPHI15]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP50]], [[MIDDLE_BLOCK]] ], [ 2.000000e+00, [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[RDX1:%.*]] = phi float [ [[RES:%.*]], [[FOR_INC:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[COND]], i64 [[IV]]
-; CHECK-NEXT: [[TMP51:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast oeq float [[TMP51]], 3.000000e+00
-; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[SRC1]], i64 [[IV]]
-; CHECK-NEXT: [[TMP52:%.*]] = load float, float* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP52]], [[RDX1]]
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[RDX2:%.*]] = phi float [ [[ADD]], [[IF_THEN]] ], [ [[RDX1]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[CMP5:%.*]] = fcmp fast oeq float [[TMP51]], 7.000000e+00
-; CHECK-NEXT: br i1 [[CMP5]], label [[IF_THEN6:%.*]], label [[FOR_INC]]
-; CHECK: if.then6:
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[SRC2]], i64 [[IV]]
-; CHECK-NEXT: [[TMP53:%.*]] = load float, float* [[ARRAYIDX7]], align 4
-; CHECK-NEXT: [[ADD2:%.*]] = fadd fast float [[TMP53]], [[RDX2]]
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[RES]] = phi float [ [[ADD2]], [[IF_THEN6]] ], [ [[RDX2]], [[IF_END]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi float [ [[RES]], [[FOR_INC]] ], [ [[TMP50]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret float [[RES_LCSSA]]
-;
+; CHECK: @llvm.vector.reduce.fadd
+; CHECK: scalar.ph
entry:
br label %for.body
;
define i32 @uncond_cond(i32* noalias %src1, i32* noalias %src2, i32* noalias %cond, i64 %N) #0 {
; CHECK-LABEL: @uncond_cond(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE7:%.*]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE7]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD1]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i64 0
-; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i64 0
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP6]], i64 1
-; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_LOAD_IF2:%.*]], label [[PRED_LOAD_CONTINUE3:%.*]]
-; CHECK: pred.load.if2:
-; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP15]], i64 1
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE3]]
-; CHECK: pred.load.continue3:
-; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF2]] ]
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP6]], i64 2
-; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]]
-; CHECK: pred.load.if4:
-; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP21]], i64 2
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]]
-; CHECK: pred.load.continue5:
-; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_LOAD_CONTINUE3]] ], [ [[TMP22]], [[PRED_LOAD_IF4]] ]
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP6]], i64 3
-; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7]]
-; CHECK: pred.load.if6:
-; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP23]], i32 [[TMP27]], i64 3
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]]
; CHECK: pred.load.continue7:
-; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP28]], [[PRED_LOAD_IF6]] ]
-; CHECK-NEXT: [[TMP30:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> zeroinitializer, <4 x i32> [[TMP29]]
-; CHECK-NEXT: [[PREDPHI]] = add <4 x i32> [[TMP2]], [[TMP30]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
+; CHECK-NOT: @llvm.vector.reduce.add
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP32:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PREDPHI]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP32]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[RES:%.*]], [[FOR_INC:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_INC]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[IV]]
-; CHECK-NEXT: [[TMP33:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP33]], [[RDX]]
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]]
-; CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP34]], 0
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[IV]]
-; CHECK-NEXT: [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP35]], [[ADD1]]
-; CHECK-NEXT: br label [[FOR_INC]]
-; CHECK: for.inc:
-; CHECK-NEXT: [[RES]] = phi i32 [ [[ADD2]], [[IF_THEN]] ], [ [[ADD1]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ [[RES]], [[FOR_INC]] ], [ [[TMP32]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[RES_LCSSA]]
-;
+; CHECK: @llvm.vector.reduce.add
+; CHECK: scalar.ph
entry:
br label %for.body
;
define i32 @uncond_cond_uncond(i32* noalias %src1, i32* noalias %src2, i32* noalias %cond, i64 noundef %N) {
; CHECK-LABEL: @uncond_cond_uncond(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], -4
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE7:%.*]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP31:%.*]], [[PRED_LOAD_CONTINUE7]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[COND:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD1]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = xor <4 x i1> [[TMP5]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP6]], i64 0
-; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP9]], i64 0
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP11:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP10]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i1> [[TMP6]], i64 1
-; CHECK-NEXT: br i1 [[TMP12]], label [[PRED_LOAD_IF2:%.*]], label [[PRED_LOAD_CONTINUE3:%.*]]
-; CHECK: pred.load.if2:
-; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP15]], i64 1
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE3]]
-; CHECK: pred.load.continue3:
-; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP11]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP16]], [[PRED_LOAD_IF2]] ]
-; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i1> [[TMP6]], i64 2
-; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_LOAD_IF4:%.*]], label [[PRED_LOAD_CONTINUE5:%.*]]
-; CHECK: pred.load.if4:
-; CHECK-NEXT: [[TMP19:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP19]]
-; CHECK-NEXT: [[TMP21:%.*]] = load i32, i32* [[TMP20]], align 4
-; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP21]], i64 2
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE5]]
-; CHECK: pred.load.continue5:
-; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP17]], [[PRED_LOAD_CONTINUE3]] ], [ [[TMP22]], [[PRED_LOAD_IF4]] ]
-; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x i1> [[TMP6]], i64 3
-; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_LOAD_IF6:%.*]], label [[PRED_LOAD_CONTINUE7]]
-; CHECK: pred.load.if6:
-; CHECK-NEXT: [[TMP25:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = load i32, i32* [[TMP26]], align 4
-; CHECK-NEXT: [[TMP28:%.*]] = insertelement <4 x i32> [[TMP23]], i32 [[TMP27]], i64 3
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE7]]
; CHECK: pred.load.continue7:
-; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP23]], [[PRED_LOAD_CONTINUE5]] ], [ [[TMP28]], [[PRED_LOAD_IF6]] ]
-; CHECK-NEXT: [[TMP30:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> zeroinitializer, <4 x i32> [[TMP29]]
-; CHECK-NEXT: [[PREDPHI:%.*]] = add <4 x i32> [[TMP2]], [[TMP30]]
-; CHECK-NEXT: [[TMP31]] = add <4 x i32> [[PREDPHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NOT: @llvm.vector.reduce.add
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP31]])
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP33]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[ADD3:%.*]], [[IF_END:%.*]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[IF_END]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[IV]]
-; CHECK-NEXT: [[TMP34:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[TMP34]], [[RDX]]
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[COND]], i64 [[IV]]
-; CHECK-NEXT: [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP35]], 0
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[IF_THEN:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[IV]]
-; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP36]], [[ADD1]]
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[ADD2]], [[IF_THEN]] ], [ [[ADD1]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ADD3]] = add nsw i32 [[RES]], [[TMP34]]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi i32 [ [[ADD3]], [[IF_END]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: ret i32 [[ADD3_LCSSA]]
-;
+; CHECK: @llvm.vector.reduce.add
+; CHECK: scalar.ph
entry:
br label %for.body
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND]])
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND1]])
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD3]])
; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VEC_IND]])
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[VEC_IND1]])
; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP7:%.*]] = mul i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[WIDE_LOAD3]])
; CHECK-NEXT: [[TMP9]] = mul i32 [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK: middle.block:
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND]])
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND1]])
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT: [[TMP8]] = add i32 [[TMP7]], [[TMP6]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
; CHECK: middle.block:
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
-; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND]])
+; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[VEC_IND1]])
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[VEC_PHI]]
; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]])
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD1]])
+; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD3]])
; CHECK-NEXT: [[TMP9]] = add i32 [[TMP8]], [[TMP7]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], <i32 4, i32 4, i32 4, i32 4>
; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP34:![0-9]+]]
; CHECK: middle.block:
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt %s -loop-vectorize -force-vector-interleave=3 -force-vector-width=4 -S | FileCheck --check-prefix=UF3 %s
; RUN: opt %s -loop-vectorize -force-vector-interleave=5 -force-vector-width=4 -S | FileCheck --check-prefix=UF5 %s
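; With an interleave factor of UF, the vector loop keeps UF separate <4 x i32>
; reduction phis; middle.block adds the partial sums together before the final
; call to @llvm.vector.reduce.add.v4i32.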
define i32 @reduction_sum(i64 %n, i32* noalias nocapture %A) {
-; UF3-LABEL: @reduction_sum(
-; UF3-NEXT: entry:
-; UF3-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
-; UF3-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 12
-; UF3-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UF3: vector.ph:
-; UF3-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 12
-; UF3-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; UF3-NEXT: br label [[VECTOR_BODY:%.*]]
-; UF3: vector.body:
-; UF3-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UF3-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; UF3-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
-; UF3-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; UF3-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; UF3-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 4
-; UF3-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 8
-; UF3-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
-; UF3-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
-; UF3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
-; UF3-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; UF3-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
-; UF3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 4
-; UF3-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 4
-; UF3-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; UF3-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
-; UF3-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
-; UF3-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; UF3-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
-; UF3-NEXT: [[TMP13]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; UF3-NEXT: [[TMP14]] = add <4 x i32> [[VEC_PHI1]], [[WIDE_LOAD3]]
-; UF3-NEXT: [[TMP15]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD4]]
-; UF3-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 12
-; UF3-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UF3-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; UF3: middle.block:
-; UF3-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP14]], [[TMP13]]
-; UF3-NEXT: [[BIN_RDX5:%.*]] = add <4 x i32> [[TMP15]], [[BIN_RDX]]
-; UF3-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX5]])
-; UF3-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; UF3-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; UF3: scalar.ph:
-; UF3-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UF3-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; UF3-NEXT: br label [[LOOP:%.*]]
-; UF3: loop:
-; UF3-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; UF3-NEXT: [[SUM_02:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[LOOP]] ]
-; UF3-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; UF3-NEXT: [[LV_A:%.*]] = load i32, i32* [[GEP_A]], align 4
-; UF3-NEXT: [[SUM_NEXT]] = add i32 [[SUM_02]], [[LV_A]]
-; UF3-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; UF3-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]]
-; UF3-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; UF3: exit:
-; UF3-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], [[LOOP]] ], [ [[TMP17]], [[MIDDLE_BLOCK]] ]
-; UF3-NEXT: ret i32 [[SUM_0_LCSSA]]
+; UF3-LABEL: vector.body:
+; UF3-NEXT: [[IV:%.+]] = phi i64 [ 0, %vector.ph ], [ [[IV_NEXT:%.+]], %vector.body ]
+; UF3-NEXT: [[SUM0:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM0_NEXT:%.+]], %vector.body ]
+; UF3-NEXT: [[SUM1:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM1_NEXT:%.+]], %vector.body ]
+; UF3-NEXT: [[SUM2:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM2_NEXT:%.+]], %vector.body ]
+; UF3-NEXT: [[IV0:%.+]] = add i64 [[IV]], 0
+; UF3-NEXT: [[IV1:%.+]] = add i64 [[IV]], 4
+; UF3-NEXT: [[IV2:%.+]] = add i64 [[IV]], 8
+; UF3-NEXT: [[GEP0:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV0]]
+; UF3-NEXT: [[GEP1:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV1]]
+; UF3-NEXT: [[GEP2:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV2]]
+; UF3-NEXT: [[L_GEP0:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 0
+; UF3-NEXT: [[BC0:%.+]] = bitcast i32* [[L_GEP0]] to <4 x i32>*
+; UF3-NEXT: [[L0:%.+]] = load <4 x i32>, <4 x i32>* [[BC0]], align 4
+; UF3-NEXT: [[L_GEP1:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 4
+; UF3-NEXT: [[BC1:%.+]] = bitcast i32* [[L_GEP1]] to <4 x i32>*
+; UF3-NEXT: [[L1:%.+]] = load <4 x i32>, <4 x i32>* [[BC1]], align 4
+; UF3-NEXT: [[L_GEP2:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 8
+; UF3-NEXT: [[BC2:%.+]] = bitcast i32* [[L_GEP2]] to <4 x i32>*
+; UF3-NEXT: [[L2:%.+]] = load <4 x i32>, <4 x i32>* [[BC2]], align 4
+; UF3-NEXT: [[SUM0_NEXT]] = add <4 x i32> [[SUM0]], [[L0]]
+; UF3-NEXT: [[SUM1_NEXT]] = add <4 x i32> [[SUM1]], [[L1]]
+; UF3-NEXT: [[SUM2_NEXT]] = add <4 x i32> [[SUM2]], [[L2]]
+; UF3-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 12
+; UF3-NEXT: [[EC:%.+]] = icmp eq i64 [[IV_NEXT]], %n.vec
+; UF3-NEXT: br i1 [[EC]], label %middle.block, label %vector.body
;
-; UF5-LABEL: @reduction_sum(
-; UF5-NEXT: entry:
-; UF5-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
-; UF5-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 20
-; UF5-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; UF5: vector.ph:
-; UF5-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 20
-; UF5-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
-; UF5-NEXT: br label [[VECTOR_BODY:%.*]]
-; UF5: vector.body:
-; UF5-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; UF5-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
-; UF5-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
-; UF5-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
-; UF5-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP24:%.*]], [[VECTOR_BODY]] ]
-; UF5-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ]
-; UF5-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 0
-; UF5-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 4
-; UF5-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 8
-; UF5-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 12
-; UF5-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 16
-; UF5-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP1]]
-; UF5-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
-; UF5-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
-; UF5-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; UF5-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
-; UF5-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 0
-; UF5-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
-; UF5-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP12]], align 4
-; UF5-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 4
-; UF5-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <4 x i32>*
-; UF5-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP14]], align 4
-; UF5-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 8
-; UF5-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP15]] to <4 x i32>*
-; UF5-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP16]], align 4
-; UF5-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 12
-; UF5-NEXT: [[TMP18:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>*
-; UF5-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP18]], align 4
-; UF5-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[TMP6]], i32 16
-; UF5-NEXT: [[TMP20:%.*]] = bitcast i32* [[TMP19]] to <4 x i32>*
-; UF5-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP20]], align 4
-; UF5-NEXT: [[TMP21]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
-; UF5-NEXT: [[TMP22]] = add <4 x i32> [[VEC_PHI1]], [[WIDE_LOAD5]]
-; UF5-NEXT: [[TMP23]] = add <4 x i32> [[VEC_PHI2]], [[WIDE_LOAD6]]
-; UF5-NEXT: [[TMP24]] = add <4 x i32> [[VEC_PHI3]], [[WIDE_LOAD7]]
-; UF5-NEXT: [[TMP25]] = add <4 x i32> [[VEC_PHI4]], [[WIDE_LOAD8]]
-; UF5-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 20
-; UF5-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; UF5-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; UF5: middle.block:
-; UF5-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP22]], [[TMP21]]
-; UF5-NEXT: [[BIN_RDX9:%.*]] = add <4 x i32> [[TMP23]], [[BIN_RDX]]
-; UF5-NEXT: [[BIN_RDX10:%.*]] = add <4 x i32> [[TMP24]], [[BIN_RDX9]]
-; UF5-NEXT: [[BIN_RDX11:%.*]] = add <4 x i32> [[TMP25]], [[BIN_RDX10]]
-; UF5-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX11]])
-; UF5-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
-; UF5-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; UF5: scalar.ph:
-; UF5-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; UF5-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; UF5-NEXT: br label [[LOOP:%.*]]
-; UF5: loop:
-; UF5-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; UF5-NEXT: [[SUM_02:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[LOOP]] ]
-; UF5-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[IV]]
-; UF5-NEXT: [[LV_A:%.*]] = load i32, i32* [[GEP_A]], align 4
-; UF5-NEXT: [[SUM_NEXT]] = add i32 [[SUM_02]], [[LV_A]]
-; UF5-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; UF5-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], [[N]]
-; UF5-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; UF5: exit:
-; UF5-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[SUM_NEXT]], [[LOOP]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ]
-; UF5-NEXT: ret i32 [[SUM_0_LCSSA]]
+; UF3-LABEL: middle.block:
+; UF3-NEXT: [[RDX0:%.+]] = add <4 x i32> [[SUM1_NEXT]], [[SUM0_NEXT]]
+; UF3-NEXT: [[RDX1:%.+]] = add <4 x i32> [[SUM2_NEXT]], [[RDX0]]
+; UF3-NEXT: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[RDX1]])
;
+; UF5-LABEL: vector.body:
+; UF5-NEXT: [[IV:%.+]] = phi i64 [ 0, %vector.ph ], [ [[IV_NEXT:%.+]], %vector.body ]
+; UF5-NEXT: [[SUM0:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM0_NEXT:%.+]], %vector.body ]
+; UF5-NEXT: [[SUM1:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM1_NEXT:%.+]], %vector.body ]
+; UF5-NEXT: [[SUM2:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM2_NEXT:%.+]], %vector.body ]
+; UF5-NEXT: [[SUM3:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM3_NEXT:%.+]], %vector.body ]
+; UF5-NEXT: [[SUM4:%.+]] = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[SUM4_NEXT:%.+]], %vector.body ]
+; UF5-NEXT: [[IV0:%.+]] = add i64 [[IV]], 0
+; UF5-NEXT: [[IV1:%.+]] = add i64 [[IV]], 4
+; UF5-NEXT: [[IV2:%.+]] = add i64 [[IV]], 8
+; UF5-NEXT: [[IV3:%.+]] = add i64 [[IV]], 12
+; UF5-NEXT: [[IV4:%.+]] = add i64 [[IV]], 16
+; UF5-NEXT: [[GEP0:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV0]]
+; UF5-NEXT: [[GEP1:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV1]]
+; UF5-NEXT: [[GEP2:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV2]]
+; UF5-NEXT: [[GEP3:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV3]]
+; UF5-NEXT: [[GEP4:%.+]] = getelementptr inbounds i32, i32* %A, i64 [[IV4]]
+; UF5-NEXT: [[L_GEP0:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 0
+; UF5-NEXT: [[BC0:%.+]] = bitcast i32* [[L_GEP0]] to <4 x i32>*
+; UF5-NEXT: [[L0:%.+]] = load <4 x i32>, <4 x i32>* [[BC0]], align 4
+; UF5-NEXT: [[L_GEP1:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 4
+; UF5-NEXT: [[BC1:%.+]] = bitcast i32* [[L_GEP1]] to <4 x i32>*
+; UF5-NEXT: [[L1:%.+]] = load <4 x i32>, <4 x i32>* [[BC1]], align 4
+; UF5-NEXT: [[L_GEP2:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 8
+; UF5-NEXT: [[BC2:%.+]] = bitcast i32* [[L_GEP2]] to <4 x i32>*
+; UF5-NEXT: [[L2:%.+]] = load <4 x i32>, <4 x i32>* [[BC2]], align 4
+; UF5-NEXT: [[L_GEP3:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 12
+; UF5-NEXT: [[BC3:%.+]] = bitcast i32* [[L_GEP3]] to <4 x i32>*
+; UF5-NEXT: [[L3:%.+]] = load <4 x i32>, <4 x i32>* [[BC3]], align 4
+; UF5-NEXT: [[L_GEP4:%.+]] = getelementptr inbounds i32, i32* [[GEP0]], i32 16
+; UF5-NEXT: [[BC4:%.+]] = bitcast i32* [[L_GEP4]] to <4 x i32>*
+; UF5-NEXT: [[L4:%.+]] = load <4 x i32>, <4 x i32>* [[BC4]], align 4
+; UF5-NEXT: [[SUM0_NEXT]] = add <4 x i32> [[SUM0]], [[L0]]
+; UF5-NEXT: [[SUM1_NEXT]] = add <4 x i32> [[SUM1]], [[L1]]
+; UF5-NEXT: [[SUM2_NEXT]] = add <4 x i32> [[SUM2]], [[L2]]
+; UF5-NEXT: [[SUM3_NEXT]] = add <4 x i32> [[SUM3]], [[L3]]
+; UF5-NEXT: [[SUM4_NEXT]] = add <4 x i32> [[SUM4]], [[L4]]
+; UF5-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 20
+; UF5-NEXT: [[EC:%.+]] = icmp eq i64 [[IV_NEXT]], %n.vec
+; UF5-NEXT: br i1 [[EC]], label %middle.block, label %vector.body
+;
+; UF5-LABEL: middle.block:
+; UF5-NEXT: [[RDX0:%.+]] = add <4 x i32> [[SUM1_NEXT]], [[SUM0_NEXT]]
+; UF5-NEXT: [[RDX1:%.+]] = add <4 x i32> [[SUM2_NEXT]], [[RDX0]]
+; UF5-NEXT: [[RDX2:%.+]] = add <4 x i32> [[SUM3_NEXT]], [[RDX1]]
+; UF5-NEXT: [[RDX3:%.+]] = add <4 x i32> [[SUM4_NEXT]], [[RDX2]]
+; UF5-NEXT: call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[RDX3]])
+;
entry:
br label %loop
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -march=r600 -mcpu=cayman -basic-aa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
; Check vectorization that would ordinarily require a runtime bounds check.
; Both parameters are unidentified objects of the same address
; space, so this should vectorize normally.
define void @foo(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 %n) #0 {
; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[N]] to i16
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i16 [[TMP0]]
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i16 [[TMP0]]
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP4]], [[A]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i16 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT: [[TMP5:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i16 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !3, !noalias !0
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP9:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i16 [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP10]], 3
-; CHECK-NEXT: [[TMP11:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i16 [[TMP11]]
-; CHECK-NEXT: store i32 [[MUL]], i32 addrspace(1)* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INC]] = add nsw i32 [[I_02]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: <4 x i32>
+; CHECK: ret
entry:
%cmp1 = icmp slt i32 0, %n
; Parameters are unidentified and in different address spaces, so this cannot vectorize.
define void @bar0(i32* %a, i32 addrspace(1)* %b, i32 %n) #0 {
; CHECK-LABEL: @bar0(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B:%.*]], i16 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP1]], 3
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_02]]
-; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_02]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
entry:
%cmp1 = icmp slt i32 0, %n
; Swapped arguments should be the same
define void @bar1(i32 addrspace(1)* %a, i32* %b, i32 %n) #0 {
; CHECK-LABEL: @bar1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_02]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A:%.*]], i16 [[TMP1]]
-; CHECK-NEXT: store i32 [[MUL]], i32 addrspace(1)* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_02]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
entry:
%cmp1 = icmp slt i32 0, %n
; Both pointers are noalias, so this should vectorize even though the
; address spaces are different.
define void @bar2(i32* noalias %a, i32 addrspace(1)* noalias %b, i32 %n) #0 {
; CHECK-LABEL: @bar2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B:%.*]], i16 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 addrspace(1)* [[TMP1]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i16 [[TMP7]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 3
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[I_02]]
-; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INC]] = add nsw i32 [[I_02]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: <4 x i32>
+; CHECK: ret
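+; Unlike @bar0, no runtime alias check is needed here because of noalias, so
+; the differing address spaces do not get in the way of vectorization.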
entry:
%cmp1 = icmp slt i32 0, %n
; Store to an identified global in a different address space. This isn't generally safe and shouldn't be vectorized.
define void @arst0(i32* %b, i32 %n) #0 {
; CHECK-LABEL: @arst0(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_02]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
-; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(1)* @g_as1, i16 0, i16 [[TMP1]]
-; CHECK-NEXT: store i32 [[MUL]], i32 addrspace(1)* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_02]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
entry:
%cmp1 = icmp slt i32 0, %n
; Load from an identified global in a different address space. This isn't generally safe and shouldn't be vectorized.
define void @arst1(i32* %b, i32 %n) #0 {
; CHECK-LABEL: @arst1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(1)* @g_as1, i16 0, i16 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP1]], 3
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_02]]
-; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_02]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT:%.*]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
entry:
%cmp1 = icmp slt i32 0, %n
; Read and write to two identified globals in different address spaces. This should be vectorized.
define void @aoeu(i32 %n) #0 {
; CHECK-LABEL: @aoeu(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[N:%.*]], 0
-; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
-; CHECK: for.body.preheader:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_VEC:%.*]] = and i32 [[N]], -4
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(2)* @q_as2, i32 0, i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(2)* [[TMP0]] to <4 x i32> addrspace(2)*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32> addrspace(2)* [[TMP1]], align 16
-; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD]], <i32 3, i32 3, i32 3, i32 3>
-; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[INDEX]] to i16
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(1)* @g_as1, i16 0, i16 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32> addrspace(1)* [[TMP5]], align 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(2)* @q_as2, i32 0, i32 [[I_02]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32 addrspace(2)* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP7]], 3
-; CHECK-NEXT: [[TMP8:%.*]] = trunc i32 [[I_02]] to i16
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [1024 x i32], [1024 x i32] addrspace(1)* @g_as1, i16 0, i16 [[TMP8]]
-; CHECK-NEXT: store i32 [[MUL]], i32 addrspace(1)* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INC]] = add nsw i32 [[I_02]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: <4 x i32>
+; CHECK: ret
entry:
%cmp1 = icmp slt i32 0, %n
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], [[X]]
-; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = mul i32 [[TMP1]], [[X]]
-; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast float* [[TMP13]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[WIDE_LOAD]], <4 x float>* [[TMP14]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], [[X]]
+; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[TMP8]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP10]], align 4
+; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP2]], [[X]]
+; CHECK-NEXT: [[TMP12:%.*]] = zext i32 [[TMP11]] to i64
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[WIDE_LOAD]], <4 x float>* [[TMP15]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=2 %s -S -debug 2>&1 | FileCheck %s
; RUN: opt -passes='loop-vectorize' -force-vector-width=2 %s -S -debug 2>&1 | FileCheck %s
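+; The checks below match LoopAccessAnalysis -debug output, which is only
+; available in builds with assertions enabled (normally guarded by a
+; REQUIRES: asserts line).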
; Make sure the runtime-check bounds include the addition of the size of the element type (a pointer) for the end bound.
define void @test(i64 %arg, i32 %arg1, i8** %base) {
-; CHECK-LABEL: @test(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[ARG:%.*]], 1
-; CHECK-NEXT: [[SMIN19:%.*]] = call i64 @llvm.smin.i64(i64 [[ARG]], i64 1)
-; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[SMIN19]]
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; CHECK: vector.scevcheck:
-; CHECK-NEXT: [[SMIN:%.*]] = call i64 @llvm.smin.i64(i64 [[ARG]], i64 1)
-; CHECK-NEXT: [[TMP2:%.*]] = sub i64 [[ARG]], [[SMIN]]
-; CHECK-NEXT: [[TMP3:%.*]] = add nsw i32 [[ARG1:%.*]], -1
-; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP2]] to i32
-; CHECK-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP4]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = sub i32 [[TMP3]], [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i32 [[TMP5]], [[TMP3]]
-; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP6]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[TMP2]], 4294967295
-; CHECK-NEXT: [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP3]] to i64
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8*, i8** [[BASE:%.*]], i64 [[TMP10]]
-; CHECK-NEXT: [[MUL1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 8, i64 [[TMP2]])
-; CHECK-NEXT: [[MUL_RESULT2:%.*]] = extractvalue { i64, i1 } [[MUL1]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW3:%.*]] = extractvalue { i64, i1 } [[MUL1]], 1
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = bitcast i8** [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[TMP11:%.*]] = sub i64 0, [[MUL_RESULT2]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[SCEVGEP4]], i64 [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp ugt i8* [[TMP12]], [[SCEVGEP4]]
-; CHECK-NEXT: [[TMP14:%.*]] = or i1 [[TMP13]], [[MUL_OVERFLOW3]]
-; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8*, i8** [[BASE]], i64 [[ARG]]
-; CHECK-NEXT: [[MUL6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 8, i64 [[TMP2]])
-; CHECK-NEXT: [[MUL_RESULT7:%.*]] = extractvalue { i64, i1 } [[MUL6]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW8:%.*]] = extractvalue { i64, i1 } [[MUL6]], 1
-; CHECK-NEXT: [[SCEVGEP59:%.*]] = bitcast i8** [[SCEVGEP5]] to i8*
-; CHECK-NEXT: [[TMP15:%.*]] = sub i64 0, [[MUL_RESULT7]]
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, i8* [[SCEVGEP59]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp ugt i8* [[TMP16]], [[SCEVGEP59]]
-; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP17]], [[MUL_OVERFLOW8]]
-; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP9]], [[TMP14]]
-; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP19]], [[TMP18]]
-; CHECK-NEXT: br i1 [[TMP20]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SMIN10:%.*]] = call i64 @llvm.smin.i64(i64 [[ARG]], i64 1)
-; CHECK-NEXT: [[TMP21:%.*]] = add nsw i32 [[ARG1]], -1
-; CHECK-NEXT: [[TMP22:%.*]] = zext i32 [[TMP21]] to i64
-; CHECK-NEXT: [[TMP23:%.*]] = add i64 [[SMIN10]], [[TMP22]]
-; CHECK-NEXT: [[TMP24:%.*]] = sub i64 [[TMP23]], [[ARG]]
-; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr i8*, i8** [[BASE]], i64 [[TMP24]]
-; CHECK-NEXT: [[SCEVGEP1112:%.*]] = bitcast i8** [[SCEVGEP11]] to i8*
-; CHECK-NEXT: [[TMP25:%.*]] = add nuw nsw i64 [[TMP22]], 1
-; CHECK-NEXT: [[SCEVGEP13:%.*]] = getelementptr i8*, i8** [[BASE]], i64 [[TMP25]]
-; CHECK-NEXT: [[SCEVGEP1314:%.*]] = bitcast i8** [[SCEVGEP13]] to i8*
-; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr i8*, i8** [[BASE]], i64 [[SMIN10]]
-; CHECK-NEXT: [[SCEVGEP1516:%.*]] = bitcast i8** [[SCEVGEP15]] to i8*
-; CHECK-NEXT: [[TMP26:%.*]] = add i64 [[ARG]], 1
-; CHECK-NEXT: [[SCEVGEP17:%.*]] = getelementptr i8*, i8** [[BASE]], i64 [[TMP26]]
-; CHECK-NEXT: [[SCEVGEP1718:%.*]] = bitcast i8** [[SCEVGEP17]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[SCEVGEP1112]], [[SCEVGEP1718]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[SCEVGEP1516]], [[SCEVGEP1314]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[ARG]], [[N_VEC]]
-; CHECK-NEXT: [[CAST_CRD:%.*]] = trunc i64 [[N_VEC]] to i32
-; CHECK-NEXT: [[IND_END21:%.*]] = sub i32 [[ARG1]], [[CAST_CRD]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP27:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[ARG1]], [[TMP27]]
-; CHECK-NEXT: [[TMP28:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[OFFSET_IDX22:%.*]] = sub i64 [[ARG]], [[INDEX]]
-; CHECK-NEXT: [[TMP29:%.*]] = add i64 [[OFFSET_IDX22]], 0
-; CHECK-NEXT: [[TMP30:%.*]] = add nsw i32 [[TMP28]], -1
-; CHECK-NEXT: [[TMP31:%.*]] = zext i32 [[TMP30]] to i64
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8*, i8** [[BASE]], i64 [[TMP31]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8*, i8** [[TMP32]], i32 0
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8*, i8** [[TMP33]], i32 -1
-; CHECK-NEXT: [[TMP35:%.*]] = bitcast i8** [[TMP34]] to <2 x i8*>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i8*>, <2 x i8*>* [[TMP35]], align 8, !alias.scope !0, !noalias !3
-; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <2 x i8*> [[WIDE_LOAD]], <2 x i8*> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8*, i8** [[BASE]], i64 [[TMP29]]
-; CHECK-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8*, i8** [[TMP36]], i32 0
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8*, i8** [[TMP37]], i32 -1
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast i8** [[TMP38]] to <2 x i8*>*
-; CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <2 x i8*>, <2 x i8*>* [[TMP39]], align 8, !alias.scope !3
-; CHECK-NEXT: [[REVERSE24:%.*]] = shufflevector <2 x i8*> [[WIDE_LOAD23]], <2 x i8*> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[REVERSE25:%.*]] = shufflevector <2 x i8*> [[REVERSE24]], <2 x i8*> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP40:%.*]] = bitcast i8** [[TMP34]] to <2 x i8*>*
-; CHECK-NEXT: store <2 x i8*> [[REVERSE25]], <2 x i8*>* [[TMP40]], align 8, !alias.scope !0, !noalias !3
-; CHECK-NEXT: [[REVERSE26:%.*]] = shufflevector <2 x i8*> [[REVERSE]], <2 x i8*> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT: [[TMP41:%.*]] = bitcast i8** [[TMP38]] to <2 x i8*>*
-; CHECK-NEXT: store <2 x i8*> [[REVERSE26]], <2 x i8*>* [[TMP41]], align 8, !alias.scope !3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[ARG]], [[ENTRY:%.*]] ], [ [[ARG]], [[VECTOR_SCEVCHECK]] ], [ [[ARG]], [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: [[BC_RESUME_VAL20:%.*]] = phi i32 [ [[IND_END21]], [[MIDDLE_BLOCK]] ], [ [[ARG1]], [[ENTRY]] ], [ [[ARG1]], [[VECTOR_SCEVCHECK]] ], [ [[ARG1]], [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL20]], [[SCALAR_PH]] ], [ [[IV_2_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[IV_2_NEXT]] = add nsw i32 [[IV_2]], -1
-; CHECK-NEXT: [[IV_2_EXT:%.*]] = zext i32 [[IV_2_NEXT]] to i64
-; CHECK-NEXT: [[IDX_1:%.*]] = getelementptr inbounds i8*, i8** [[BASE]], i64 [[IV_2_EXT]]
-; CHECK-NEXT: [[V_1:%.*]] = load i8*, i8** [[IDX_1]], align 8
-; CHECK-NEXT: [[IDX_2:%.*]] = getelementptr inbounds i8*, i8** [[BASE]], i64 [[IV_1]]
-; CHECK-NEXT: [[V_2:%.*]] = load i8*, i8** [[IDX_2]], align 8
-; CHECK-NEXT: store i8* [[V_2]], i8** [[IDX_1]], align 8
-; CHECK-NEXT: store i8* [[V_1]], i8** [[IDX_2]], align 8
-; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt i64 [[IV_1]], 1
-; CHECK-NEXT: [[IV_1_NEXT]] = add nsw i64 [[IV_1]], -1
-; CHECK-NEXT: br i1 [[TMP11]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
-
+; CHECK: LAA: Adding RT check for range:
+; CHECK-NEXT: Start: ((8 * (zext i32 (-1 + %arg1)<nsw> to i64))<nuw><nsw> + (8 * (1 smin %arg)) + (-8 * %arg) + %base)
+; CHECK-SAME: End: (8 + (8 * (zext i32 (-1 + %arg1)<nsw> to i64))<nuw><nsw> + %base)
+; CHECK-NEXT: LAA: Adding RT check for range:
+; CHECK-NEXT: Start: ((8 * (1 smin %arg)) + %base)
+; CHECK-SAME: End: (8 + (8 * %arg) + %base)
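+; The leading "8 +" in each End expression is the size in bytes of the i8*
+; element, i.e. the end bound extends one element past the last access.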
+
+; CHECK: vector.body
entry:
br label %loop
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -march=r600 -mcpu=cayman -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
; Artificial datalayout
define void @add_ints_1_1_1(i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #0 {
; CHECK-LABEL: @add_ints_1_1_1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i16 200
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i16 200
-; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32 addrspace(1)* [[C:%.*]], i16 200
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP4]], [[A]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: [[BOUND09:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP7]], [[A]]
-; CHECK-NEXT: [[BOUND110:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[C]]
-; CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
-; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i16
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i16 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 addrspace(1)* [[TMP1]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP2]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[INDEX]] to i16
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[C]], i16 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !3
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[WIDE_LOAD12]]
-; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[INDEX]] to i16
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i16 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <4 x i32> addrspace(1)*
-; CHECK-NEXT: store <4 x i32> [[TMP6]], <4 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !5, !noalias !7
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_01:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i16 [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[C]], i16 [[TMP13]]
-; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP12]], [[TMP14]]
-; CHECK-NEXT: [[TMP15:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i16 [[TMP15]]
-; CHECK-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INC]] = add i64 [[I_01]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INC]], 200
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: <4 x i32>
+; CHECK: ret
entry:
br label %for.body
define void @add_ints_as_1_0_0(i32 addrspace(1)* %a, i32* %b, i32* %c) #0 {
; CHECK-LABEL: @add_ints_as_1_0_0(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_01:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[I_01]] to i32
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[I_01]] to i32
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A:%.*]], i16 [[TMP4]]
-; CHECK-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INC]] = add i64 [[I_01]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INC]], 200
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
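+; Runtime alias checks cannot compare pointers from different address spaces,
+; so no vector.memcheck can be generated and the loop stays scalar.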
entry:
br label %for.body
define void @add_ints_as_0_1_0(i32* %a, i32 addrspace(1)* %b, i32* %c) #0 {
; CHECK-LABEL: @add_ints_as_0_1_0(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_01:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B:%.*]], i16 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[I_01]] to i32
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[I_01]] to i32
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP4]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INC]] = add i64 [[I_01]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INC]], 200
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
entry:
br label %for.body
define void @add_ints_as_0_1_1(i32* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) #0 {
; CHECK-LABEL: @add_ints_as_0_1_1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_01:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B:%.*]], i16 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[C:%.*]], i16 [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[I_01]] to i32
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP4]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INC]] = add i64 [[I_01]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INC]], 200
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
entry:
br label %for.body
define void @add_ints_as_0_1_2(i32* %a, i32 addrspace(1)* %b, i32 addrspace(2)* %c) #0 {
; CHECK-LABEL: @add_ints_as_0_1_2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_01:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[I_01]] to i16
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B:%.*]], i16 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[I_01]] to i32
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32 addrspace(2)* [[C:%.*]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32 addrspace(2)* [[ARRAYIDX1]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[I_01]] to i32
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP4]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[INC]] = add i64 [[I_01]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INC]], 200
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-NOT: <4 x i32>
+; CHECK: ret
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
define void @add_ints(i32* nocapture %A, i32* nocapture %B, i32* nocapture %C) {
; CHECK-LABEL: @add_ints(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
+; CHECK:       vector.memcheck:
; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 200
; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 200
; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[C:%.*]], i64 200
; CHECK-NEXT: [[BOUND110:%.*]] = icmp ugt i32* [[SCEVGEP]], [[C]]
; CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
-; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %scalar.ph, label %vector.ph
; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: br label %vector.body
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !3
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD12]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP6]], align 4, !alias.scope !5, !noalias !7
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 200, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP9]], [[TMP8]]
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], 200
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
;
entry:
br label %for.body
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
-; CHECK-NEXT: br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP0]], 3
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP11]], 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP12]]
; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP4]]
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP12]]
; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = urem i32 [[TMP5]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !alias.scope !0
-; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP10]], <2 x i32>* [[TMP13]], align 4, !alias.scope !3, !noalias !0
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = urem i32 [[TMP13]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4, !alias.scope !0
+; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP13]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4, !alias.scope !3, !noalias !0
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
-; CHECK-NEXT: br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP0]], 3
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK: vector.memcheck:
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[TMP2]] to i64
-; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP3]], 1
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
+; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[TMP11]], 1
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[B]], i64 [[TMP12]]
; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP4]]
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[TMP12]]
; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast i32* [[SCEVGEP4]] to i8*
; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = urem i32 [[TMP5]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP5]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP9]], align 4, !alias.scope !8, !noalias !11
-; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP6]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP10]], <2 x i32>* [[TMP13]], align 4, !alias.scope !11
+; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = urem i32 [[TMP13]], 4
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP13]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP15]], i32 0
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <2 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP17]], align 4, !alias.scope !8, !noalias !11
+; CHECK-NEXT: [[TMP18:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP14]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP19]], i32 0
+; CHECK-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP18]], <2 x i32>* [[TMP21]], align 4, !alias.scope !11
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
; CHECK: vector.scevcheck:
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[TMP0]], 3
-; CHECK-NEXT: br i1 [[TMP1]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP0]], 3
+; CHECK-NEXT: br i1 [[TMP7]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 2
; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = urem i32 [[TMP2]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP6]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[TMP7]], <2 x i32>* [[TMP8]], align 4
+; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP11:%.*]] = urem i32 [[TMP10]], 4
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP11]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i32 0
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP14]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i32> [[WIDE_LOAD]], <i32 10, i32 10>
+; CHECK-NEXT: [[TMP16:%.*]] = bitcast i32* [[TMP13]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[TMP15]], <2 x i32>* [[TMP16]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S | FileCheck %s
define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1600, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1600, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1600, [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 2 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x float>, <vscale x 2 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 2
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 2 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 2 x float>, <vscale x 2 x float>* [[TMP17]], align 4
-; CHECK-NEXT: [[TMP18:%.*]] = fcmp ogt <vscale x 2 x float> [[WIDE_LOAD]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.000000e+02, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP19:%.*]] = fcmp ogt <vscale x 2 x float> [[WIDE_LOAD1]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.000000e+02, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP20:%.*]] = extractelement <vscale x 2 x i1> [[TMP18]], i32 0
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP20]])
-; CHECK-NEXT: [[TMP21:%.*]] = extractelement <vscale x 2 x i1> [[TMP19]], i32 0
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP21]])
-; CHECK-NEXT: [[TMP22:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.000000e+00, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP23:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD1]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.000000e+00, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 0
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <vscale x 2 x float>*
-; CHECK-NEXT: store <vscale x 2 x float> [[TMP22]], <vscale x 2 x float>* [[TMP27]], align 4
-; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP29:%.*]] = mul i32 [[TMP28]], 2
-; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[TMP24]], i32 [[TMP29]]
-; CHECK-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <vscale x 2 x float>*
-; CHECK-NEXT: store <vscale x 2 x float> [[TMP23]], <vscale x 2 x float>* [[TMP31]], align 4
-; CHECK-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP33:%.*]] = mul i64 [[TMP32]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP33]]
-; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP34]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1600, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP35:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP35]], 1.000000e+02
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP1]])
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP35]], 1.000000e+00
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK: [[FCMP1:%.*]] = fcmp ogt <vscale x 2 x float>
+; CHECK-NEXT: [[FCMP2:%.*]] = fcmp ogt <vscale x 2 x float>
+; CHECK-NEXT: [[FCMP1L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP1]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP1L0]])
+; CHECK-NEXT: [[FCMP2L0:%.*]] = extractelement <vscale x 2 x i1> [[FCMP2]], i32 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[FCMP2L0]])
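+; The pair of fcmp/assume sequences comes from -force-vector-interleave=2: each
+; vector iteration executes two interleaved parts.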
entry:
br label %for.body
define void @test2(float *%a, float *%b) {
; CHECK-LABEL: @test2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[B1:%.*]] = bitcast float* [[B:%.*]] to i8*
-; CHECK-NEXT: [[A3:%.*]] = bitcast float* [[A:%.*]] to i8*
-; CHECK-NEXT: [[PTRINT1:%.*]] = ptrtoint float* [[A]] to i64
-; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[PTRINT1]], 0
-; CHECK-NEXT: [[PTRINT2:%.*]] = ptrtoint float* [[B]] to i64
-; CHECK-NEXT: [[MASKCOND4:%.*]] = icmp eq i64 [[PTRINT2]], 0
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1600, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[B]], i64 1600
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[A]], i64 1600
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[A3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1600, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1600, [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: entry:
+; CHECK: [[MASKCOND:%.*]] = icmp eq i64 %ptrint1, 0
+; CHECK: [[MASKCOND4:%.*]] = icmp eq i64 %ptrint2, 0
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 2 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x float>, <vscale x 2 x float>* [[TMP13]], align 4, !alias.scope !4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 2
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 2 x float>*
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 2 x float>, <vscale x 2 x float>* [[TMP17]], align 4, !alias.scope !4
-; CHECK-NEXT: [[TMP18:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.000000e+00, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP19:%.*]] = fadd <vscale x 2 x float> [[WIDE_LOAD6]], shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 1.000000e+00, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <vscale x 2 x float>*
-; CHECK-NEXT: store <vscale x 2 x float> [[TMP18]], <vscale x 2 x float>* [[TMP23]], align 4, !alias.scope !7, !noalias !4
-; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 2
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <vscale x 2 x float>*
-; CHECK-NEXT: store <vscale x 2 x float> [[TMP19]], <vscale x 2 x float>* [[TMP27]], align 4, !alias.scope !7, !noalias !4
-; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP29]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1600, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK: tail call void @llvm.assume(i1 [[MASKCOND]])
; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND]])
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP31:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP31]], 1.000000e+00
+; CHECK: tail call void @llvm.assume(i1 [[MASKCOND4]])
; CHECK-NEXT: tail call void @llvm.assume(i1 [[MASKCOND4]])
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
entry:
%ptrint1 = ptrtoint float* %a to i64
%maskcond = icmp eq i64 %ptrint1, 0
define void @predicated_assume(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %n) {
; Check that the vector.body does not contain any assumes.
; CHECK-LABEL: @predicated_assume(
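; For reference, a minimal sketch of the scalar pattern this test exercises
; (value and block names are illustrative, not the exact ones in this file):
; an assume guarded by a loop-varying condition, which the vectorizer is
; expected to drop rather than replicate per lane in vector.body.
;   if.else:
;     %cmp2 = icmp ult i64 %iv, 991232
;     tail call void @llvm.assume(i1 %cmp2)
;     br label %if.end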
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; CHECK-NEXT: [[TMP5:%.*]] = add <vscale x 2 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = mul <vscale x 2 x i64> [[TMP5]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i32 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2
-; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp ult <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 495616, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP17:%.*]] = icmp ult <vscale x 2 x i64> [[STEP_ADD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 495616, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP18:%.*]] = icmp ult <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 991232, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP19:%.*]] = icmp ult <vscale x 2 x i64> [[STEP_ADD]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 991232, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP20:%.*]] = xor <vscale x 2 x i1> [[TMP16]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP21:%.*]] = xor <vscale x 2 x i1> [[TMP17]], shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i32 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 2 x i1> [[TMP16]], <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 2.300000e+01, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 4.200000e+01, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[PREDPHI2:%.*]] = select <vscale x 2 x i1> [[TMP17]], <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 2.300000e+01, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer), <vscale x 2 x float> shufflevector (<vscale x 2 x float> insertelement (<vscale x 2 x float> poison, float 4.200000e+01, i32 0), <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP22]], i32 0
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 2 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x float>, <vscale x 2 x float>* [[TMP25]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 2
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP22]], i32 [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 2 x float>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 2 x float>, <vscale x 2 x float>* [[TMP29]], align 4
-; CHECK-NEXT: [[TMP30:%.*]] = fmul <vscale x 2 x float> [[PREDPHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP31:%.*]] = fmul <vscale x 2 x float> [[PREDPHI2]], [[WIDE_LOAD3]]
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP32]], i32 0
-; CHECK-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <vscale x 2 x float>*
-; CHECK-NEXT: store <vscale x 2 x float> [[TMP30]], <vscale x 2 x float>* [[TMP35]], align 4
-; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP37:%.*]] = mul i32 [[TMP36]], 2
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP32]], i32 [[TMP37]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <vscale x 2 x float>*
-; CHECK-NEXT: store <vscale x 2 x float> [[TMP31]], <vscale x 2 x float>* [[TMP39]], align 4
-; CHECK-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP41]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[STEP_ADD]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-NOT: llvm.assume
; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END5:%.*]] ]
-; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[INDVARS_IV]], 495616
-; CHECK-NEXT: br i1 [[CMP1]], label [[IF_END5]], label [[IF_ELSE:%.*]]
-; CHECK: if.else:
-; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 991232
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]])
-; CHECK-NEXT: br label [[IF_END5]]
-; CHECK: if.end5:
-; CHECK-NEXT: [[X_0:%.*]] = phi float [ 4.200000e+01, [[IF_ELSE]] ], [ 2.300000e+01, [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP43:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = fmul float [[X_0]], [[TMP43]]
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[MUL]], float* [[ARRAYIDX7]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -scalable-vectorization=on -force-target-supports-scalable-vectors=true -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s
define void @test1(float* noalias nocapture %a, float* noalias nocapture readonly %b) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1600, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1600, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1600, [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP13]], align 4
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP17]], align 4
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]])
-; CHECK-NEXT: [[TMP18:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP19:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD1]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <vscale x 4 x float>*
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], <vscale x 4 x float>* [[TMP23]], align 4
-; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 4
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <vscale x 4 x float>*
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], <vscale x 4 x float>* [[TMP27]], align 4
-; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP29]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1600, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP31:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[CMP1:%.*]] = fcmp ogt float [[TMP31]], 1.000000e+02
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP31]], 1.000000e+00
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body
+; CHECK-LABEL: @test1
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
for.body: ; preds = %for.body, %entry
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
%struct.data = type { float*, float* }
define void @test2(float* %a, float* %b) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A1:%.*]] = bitcast float* [[A:%.*]] to i8*
-; CHECK-NEXT: [[B3:%.*]] = bitcast float* [[B:%.*]] to i8*
-; CHECK-NEXT: [[PTRINT:%.*]] = ptrtoint float* [[B]] to i64
-; CHECK-NEXT: [[MASKCOND:%.*]] = icmp eq i64 [[PTRINT]], 0
-; CHECK-NEXT: [[PTRINT2:%.*]] = ptrtoint float* [[A]] to i64
-; CHECK-NEXT: [[MASKCOND4:%.*]] = icmp eq i64 [[PTRINT2]], 0
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1600, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A]], i64 1600
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast float* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[B]], i64 1600
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast float* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[A1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[B3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1600, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1600, [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[TMP12]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP13]], align 4, !alias.scope !7
-; CHECK-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 4
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP10]], i32 [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP17]], align 4, !alias.scope !7
-; CHECK-NEXT: [[TMP18:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP19:%.*]] = fadd <vscale x 4 x float> [[WIDE_LOAD6]], shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 1.000000e+00, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META10:![0-9]+]])
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 0
-; CHECK-NEXT: [[TMP23:%.*]] = bitcast float* [[TMP22]] to <vscale x 4 x float>*
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP18]], <vscale x 4 x float>* [[TMP23]], align 4, !alias.scope !12, !noalias !7
-; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 4
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[TMP20]], i32 [[TMP25]]
-; CHECK-NEXT: [[TMP27:%.*]] = bitcast float* [[TMP26]] to <vscale x 4 x float>*
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP19]], <vscale x 4 x float>* [[TMP27]], align 4, !alias.scope !12, !noalias !7
-; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP29]]
-; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1600, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP31:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP31]], 1.000000e+00
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META10]])
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[ADD]], float* [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV]], 1599
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @test2
+; CHECK: vector.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST:!.*]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST:!.*]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: for.body:
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE0_LIST]])
+; CHECK: @llvm.experimental.noalias.scope.decl(metadata [[SCOPE4_LIST]])
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: ret void
entry:
%ptrint = ptrtoint float* %b to i64
%maskcond = icmp eq i64 %ptrint, 0
}
define void @predicated_noalias_scope_decl(float* noalias nocapture readonly %a, float* noalias nocapture %b, i64 %n) {
+
; Check that the vector.body still contains an llvm.experimental.noalias.scope.decl call.
+
; CHECK-LABEL: @predicated_noalias_scope_decl(
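; For reference, a minimal sketch of the scalar pattern being vectorized here
; (names and metadata numbering are illustrative): the scope declaration sits
; on a predicated path, yet a single unconditional copy is still expected in
; vector.body.
;   if.else:
;     tail call void @llvm.experimental.noalias.scope.decl(metadata !0)
;     br label %if.end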
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP5:%.*]] = add <vscale x 4 x i64> [[TMP4]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = mul <vscale x 4 x i64> [[TMP5]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> zeroinitializer, [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4
-; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]]
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP9]], i32 0
-; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
-; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = icmp ult <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 495616, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP17:%.*]] = icmp ult <vscale x 4 x i64> [[STEP_ADD]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 495616, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP18:%.*]] = icmp ult <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 991232, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP19:%.*]] = icmp ult <vscale x 4 x i64> [[STEP_ADD]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 991232, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT: [[TMP20:%.*]] = xor <vscale x 4 x i1> [[TMP16]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP21:%.*]] = xor <vscale x 4 x i1> [[TMP17]], shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.300000e+01, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 4.200000e+01, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[PREDPHI2:%.*]] = select <vscale x 4 x i1> [[TMP17]], <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 2.300000e+01, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> poison, float 4.200000e+01, i32 0), <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[TMP22]], i32 0
-; CHECK-NEXT: [[TMP25:%.*]] = bitcast float* [[TMP24]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP25]], align 4
-; CHECK-NEXT: [[TMP26:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP27:%.*]] = mul i32 [[TMP26]], 4
-; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[TMP22]], i32 [[TMP27]]
-; CHECK-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <vscale x 4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[TMP29]], align 4
-; CHECK-NEXT: [[TMP30:%.*]] = fmul <vscale x 4 x float> [[PREDPHI]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP31:%.*]] = fmul <vscale x 4 x float> [[PREDPHI2]], [[WIDE_LOAD3]]
-; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[TMP15]]
-; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[TMP32]], i32 0
-; CHECK-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <vscale x 4 x float>*
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP30]], <vscale x 4 x float>* [[TMP35]], align 4
-; CHECK-NEXT: [[TMP36:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP37:%.*]] = mul i32 [[TMP36]], 4
-; CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[TMP32]], i32 [[TMP37]]
-; CHECK-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <vscale x 4 x float>*
-; CHECK-NEXT: store <vscale x 4 x float> [[TMP31]], <vscale x 4 x float>* [[TMP39]], align 4
-; CHECK-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP41]]
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
-; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[IF_END5:%.*]] ]
-; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[INDVARS_IV]], 495616
-; CHECK-NEXT: br i1 [[CMP1]], label [[IF_END5]], label [[IF_ELSE:%.*]]
-; CHECK: if.else:
-; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 991232
-; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
-; CHECK-NEXT: br label [[IF_END5]]
-; CHECK: if.end5:
-; CHECK-NEXT: [[X_0:%.*]] = phi float [ 4.200000e+01, [[IF_ELSE]] ], [ 2.300000e+01, [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP43:%.*]] = load float, float* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[MUL:%.*]] = fmul float [[X_0]], [[TMP43]]
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[MUL]], float* [[ARRAYIDX7]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
-; CHECK: for.cond.cleanup:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body:
+; CHECK: call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: scalar.ph:
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: if.else:
+; CHECK: call void @llvm.experimental.noalias.scope.decl
+; CHECK-NOT: @llvm.experimental.noalias.scope.decl
+; CHECK: }
entry:
br label %for.body
!5 = distinct !{!5, !6}
!6 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+; CHECK: [[SCOPE0_LIST]] = !{[[SCOPE0:!.*]]}
+; CHECK: [[SCOPE0]] = distinct !{[[SCOPE0]], [[SCOPE0_DOM:!.*]]}
+; CHECK: [[SCOPE0_DOM]] = distinct !{[[SCOPE0_DOM]]}
+; CHECK: [[SCOPE4_LIST]] = !{[[SCOPE4:!.*]]}
+; CHECK: [[SCOPE4]] = distinct !{[[SCOPE4]], [[SCOPE0_DOM]]}
define i8 @reduction_add_trunc(i8* noalias nocapture %A) {
; CHECK-LABEL: @reduction_add_trunc(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 16
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 256, [[TMP1]]
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 16
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 256, [[TMP3]]
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 256, [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 255, i32 0), [[VECTOR_PH]] ], [ [[TMP28:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = and <vscale x 8 x i32> [[VEC_PHI]], shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 255, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP11:%.*]] = and <vscale x 8 x i32> [[VEC_PHI1]], shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 255, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[TMP4]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[TMP9]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <vscale x 8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>* [[TMP15]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP17:%.*]] = mul i32 [[TMP16]], 8
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, i8* [[TMP12]], i32 [[TMP17]]
-; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <vscale x 8 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>* [[TMP19]], align 4
-; CHECK-NEXT: [[TMP20:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP21:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP22:%.*]] = add <vscale x 8 x i32> [[TMP10]], [[TMP20]]
-; CHECK-NEXT: [[TMP23:%.*]] = add <vscale x 8 x i32> [[TMP11]], [[TMP21]]
-; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP25]]
-; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: [[TMP27:%.*]] = trunc <vscale x 8 x i32> [[TMP22]] to <vscale x 8 x i8>
-; CHECK-NEXT: [[TMP28]] = zext <vscale x 8 x i8> [[TMP27]] to <vscale x 8 x i32>
-; CHECK-NEXT: [[TMP29:%.*]] = trunc <vscale x 8 x i32> [[TMP23]] to <vscale x 8 x i8>
-; CHECK-NEXT: [[TMP30]] = zext <vscale x 8 x i8> [[TMP29]] to <vscale x 8 x i32>
-; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ insertelement (<vscale x 8 x i32> zeroinitializer, i32 255, i32 0), %vector.ph ], [ [[TMP34:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %vector.ph ], [ [[TMP36:%.*]], %vector.body ]
+; CHECK: [[TMP14:%.*]] = and <vscale x 8 x i32> [[VEC_PHI]], shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 255, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP15:%.*]] = and <vscale x 8 x i32> [[VEC_PHI1]], shufflevector (<vscale x 8 x i32> insertelement (<vscale x 8 x i32> poison, i32 255, i32 0), <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>*
+; CHECK: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i8>, <vscale x 8 x i8>*
+; CHECK-NEXT: [[TMP26:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD2]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[TMP28:%.*]] = add <vscale x 8 x i32> [[TMP14]], [[TMP26]]
+; CHECK-NEXT: [[TMP29:%.*]] = add <vscale x 8 x i32> [[TMP15]], [[TMP27]]
+; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP31:%.*]] = mul i32 [[TMP30]], 16
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP31]]
+; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[INDEX_NEXT]], {{%.*}}
+; CHECK-NEXT: [[TMP33:%.*]] = trunc <vscale x 8 x i32> [[TMP28]] to <vscale x 8 x i8>
+; CHECK-NEXT: [[TMP34]] = zext <vscale x 8 x i8> [[TMP33]] to <vscale x 8 x i32>
+; CHECK-NEXT: [[TMP35:%.*]] = trunc <vscale x 8 x i32> [[TMP29]] to <vscale x 8 x i8>
+; CHECK-NEXT: [[TMP36]] = zext <vscale x 8 x i8> [[TMP35]] to <vscale x 8 x i32>
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP31:%.*]] = trunc <vscale x 8 x i32> [[TMP28]] to <vscale x 8 x i8>
-; CHECK-NEXT: [[TMP32:%.*]] = trunc <vscale x 8 x i32> [[TMP30]] to <vscale x 8 x i8>
-; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i8> [[TMP32]], [[TMP31]]
-; CHECK-NEXT: [[TMP33:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[BIN_RDX]])
-; CHECK-NEXT: [[TMP34:%.*]] = zext i8 [[TMP33]] to i32
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 256, [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 255, [[ENTRY]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[INDVARS_IV_NEXT:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[L9:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 255
-; CHECK-NEXT: [[L2:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[INDVARS_IV]]
-; CHECK-NEXT: [[L3:%.*]] = load i8, i8* [[L2]], align 4
-; CHECK-NEXT: [[L3E:%.*]] = zext i8 [[L3]] to i32
-; CHECK-NEXT: [[L9]] = add i32 [[SUM_02]], [[L3E]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ [[L9]], [[LOOP]] ], [ [[TMP34]], [[MIDDLE_BLOCK]] ]
-; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[SUM_0_LCSSA]] to i8
-; CHECK-NEXT: ret i8 [[RET]]
+; CHECK-NEXT: [[TMP37:%.*]] = trunc <vscale x 8 x i32> [[TMP34]] to <vscale x 8 x i8>
+; CHECK-NEXT: [[TMP38:%.*]] = trunc <vscale x 8 x i32> [[TMP36]] to <vscale x 8 x i8>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <vscale x 8 x i8> [[TMP38]], [[TMP37]]
+; CHECK-NEXT: [[TMP39:%.*]] = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> [[BIN_RDX]])
+; CHECK-NEXT: [[TMP40:%.*]] = zext i8 [[TMP39]] to i32
;
entry:
br label %loop
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[VAL:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT]] to <vscale x 4 x i16>
+; CHECK-NEXT: [[TMP4:%.*]] = trunc <vscale x 4 x i32> [[BROADCAST_SPLAT2]] to <vscale x 4 x i16>
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[HPTR:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[TMP5]] to <vscale x 4 x i16>*
; CHECK-NEXT: store <vscale x 4 x i16> [[TMP4]], <vscale x 4 x i16>* [[TMP6]], align 2
; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[LEN]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[P]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP13:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP13]] to i32
-; CHECK-NEXT: [[CONV17:%.*]] = xor i32 [[CONV]], [[ARG1]]
-; CHECK-NEXT: [[MUL18:%.*]] = mul nuw nsw i32 [[CONV17]], [[CONV]]
-; CHECK-NEXT: [[CONV19:%.*]] = trunc i32 [[MUL18]] to i8
-; CHECK-NEXT: store i8 [[CONV19]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[LEN]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: for.exit:
-; CHECK-NEXT: ret void
-;
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S -scalable-vectorization=on < %s 2>&1 | FileCheck %s
; CHECK: remark: <unknown>:0:0: User-specified vectorization factor vscale x 4 is ignored because the target does not support scalable vectors. The compiler will pick a more suitable value.
; CHECK: LV: The Widest register safe to use is: 32 bits.
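; For context, a minimal sketch (metadata numbering is illustrative) of the
; loop hints that request a scalable VF and thus trigger the remark above on
; a target without scalable-vector support:
;   br i1 %exitcond, label %exit, label %loop, !llvm.loop !0
;   !0 = distinct !{!0, !1, !2}
;   !1 = !{!"llvm.loop.vectorize.width", i32 4}
;   !2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}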
define void @test1(i32* %a, i32* %b) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IV]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[IV]]
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[IV]], 4
-; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
-; CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX5]], align 4
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
entry:
br label %loop
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; REQUIRES: asserts
; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -S -o - < %s 2>&1 | FileCheck %s
; The bitcast below will be scalarized due to the predication in the loop. Bitcasts
; between pointer types should be treated as free, despite the scalarization.
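; For reference, a minimal sketch of that pattern (field and value names are
; illustrative): a pointer-to-pointer bitcast feeding an access that executes
; under a condition, so the cast gets replicated for each scalarized lane.
;   %b.i32 = bitcast i64* %b.field to i32*
;   ...
;   if.then:
;     %v = load i32, i32* %b.i32, align 4
; The cast lowers to no machine code, so its scalarized copies should be
; costed as free.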
define void @foo(%struct.foo* noalias nocapture %in, i32* noalias nocapture readnone %out, i64 %n) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; CHECK: vector.scevcheck:
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr [[STRUCT_FOO:%.*]], %struct.foo* [[IN:%.*]], i64 0, i32 1
-; CHECK-NEXT: [[SCEVGEP1:%.*]] = bitcast i64* [[SCEVGEP]] to %struct.foo*
-; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 12, i64 [[TMP0]])
-; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
-; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
-; CHECK-NEXT: [[SCEVGEP12:%.*]] = bitcast %struct.foo* [[SCEVGEP1]] to i8*
-; CHECK-NEXT: [[TMP1:%.*]] = sub i64 0, [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[SCEVGEP12]], i64 [[MUL_RESULT]]
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i8* [[TMP2]], [[SCEVGEP12]]
-; CHECK-NEXT: [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_FOO]], %struct.foo* [[IN]], i64 [[TMP5]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_FOO]], %struct.foo* [[IN]], i64 [[TMP6]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = bitcast i64* [[TMP7]] to i32*
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i64* [[TMP8]] to i32*
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[STRUCT_FOO]], %struct.foo* [[IN]], i64 [[TMP5]], i32 0
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_FOO]], %struct.foo* [[IN]], i64 [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP11]], align 8
-; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP12]], align 8
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> poison, i32 [[TMP13]], i32 0
-; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> [[TMP15]], i32 [[TMP14]], i32 1
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq <2 x i32> [[TMP16]], zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = xor <2 x i1> [[TMP17]], <i1 true, i1 true>
-; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i1> [[TMP18]], i32 0
-; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
-; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP9]], align 4
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
-; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP21:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP20]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP18]], i32 1
-; CHECK-NEXT: br i1 [[TMP22]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]]
-; CHECK: pred.load.if3:
-; CHECK-NEXT: [[TMP23:%.*]] = load i32, i32* [[TMP10]], align 4
-; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]]
-; CHECK: pred.load.continue4:
-; CHECK-NEXT: [[TMP24:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP23]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> poison, i32 [[TMP21]], i32 0
-; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x i32> [[TMP25]], i32 [[TMP24]], i32 1
-; CHECK-NEXT: [[TMP27:%.*]] = icmp sgt <2 x i32> [[TMP26]], zeroinitializer
-; CHECK-NEXT: [[TMP28:%.*]] = select <2 x i1> [[TMP18]], <2 x i1> [[TMP27]], <2 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x i1> [[TMP28]], i32 0
-; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
-; CHECK: pred.store.if:
-; CHECK-NEXT: [[TMP30:%.*]] = add nsw i32 [[TMP21]], -1
-; CHECK-NEXT: store i32 [[TMP30]], i32* [[TMP9]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
-; CHECK: pred.store.continue:
-; CHECK-NEXT: [[TMP31:%.*]] = extractelement <2 x i1> [[TMP28]], i32 1
-; CHECK-NEXT: br i1 [[TMP31]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
-; CHECK: pred.store.if5:
-; CHECK-NEXT: [[TMP32:%.*]] = add nsw i32 [[TMP24]], -1
-; CHECK-NEXT: store i32 [[TMP32]], i32* [[TMP10]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
-; CHECK: pred.store.continue6:
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP33]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_012:%.*]] = phi i64 [ [[INC:%.*]], [[IF_END:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_FOO]], %struct.foo* [[IN]], i64 [[I_012]], i32 1
-; CHECK-NEXT: [[TMP34:%.*]] = bitcast i64* [[B]] to i32*
-; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT_FOO]], %struct.foo* [[IN]], i64 [[I_012]], i32 0
-; CHECK-NEXT: [[TMP35:%.*]] = load i32, i32* [[A]], align 8
-; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP35]], 0
-; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END]], label [[LAND_LHS_TRUE:%.*]]
-; CHECK: land.lhs.true:
-; CHECK-NEXT: [[TMP36:%.*]] = load i32, i32* [[TMP34]], align 4
-; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[TMP36]], 0
-; CHECK-NEXT: br i1 [[CMP2]], label [[IF_THEN:%.*]], label [[IF_END]]
-; CHECK: if.then:
-; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP36]], -1
-; CHECK-NEXT: store i32 [[SUB]], i32* [[TMP34]], align 4
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_012]], 1
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -S < %s | FileCheck %s --check-prefix=CHECK-VF2IC1
; RUN: opt -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -S < %s | FileCheck %s --check-prefix=CHECK-VF1IC2
define i32 @pred_select_const_i32_from_icmp(i32* noalias nocapture readonly %src1, i32* noalias nocapture readonly %src2, i64 %n) {
; CHECK-VF2IC1-LABEL: @pred_select_const_i32_from_icmp(
-; CHECK-VF2IC1-NEXT: entry:
-; CHECK-VF2IC1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-VF2IC1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF2IC1: vector.ph:
-; CHECK-VF2IC1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-VF2IC1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF2IC1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-VF2IC1: vector.body:
-; CHECK-VF2IC1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE2:%.*]] ]
-; CHECK-VF2IC1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE2]] ]
-; CHECK-VF2IC1-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF2IC1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 [[TMP0]]
-; CHECK-VF2IC1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; CHECK-VF2IC1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-VF2IC1-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
+; CHECK-VF2IC1: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue2 ]
+; CHECK-VF2IC1: [[WIDE_LOAD:%.*]] = load <2 x i32>, <2 x i32>* {{%.*}}, align 4
; CHECK-VF2IC1-NEXT: [[TMP4:%.*]] = icmp sgt <2 x i32> [[WIDE_LOAD]], <i32 35, i32 35>
; CHECK-VF2IC1-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
-; CHECK-VF2IC1-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK-VF2IC1-NEXT: br i1 [[TMP5]], label %pred.load.if, label %pred.load.continue
; CHECK-VF2IC1: pred.load.if:
-; CHECK-VF2IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 [[TMP0]]
+; CHECK-VF2IC1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 {{%.*}}
; CHECK-VF2IC1-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
; CHECK-VF2IC1-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[TMP7]], i32 0
-; CHECK-VF2IC1-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; CHECK-VF2IC1-NEXT: br label %pred.load.continue
; CHECK-VF2IC1: pred.load.continue:
-; CHECK-VF2IC1-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP8]], [[PRED_LOAD_IF]] ]
+; CHECK-VF2IC1-NEXT: [[TMP9:%.*]] = phi <2 x i32> [ poison, %vector.body ], [ [[TMP8]], %pred.load.if ]
; CHECK-VF2IC1-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
-; CHECK-VF2IC1-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2]]
+; CHECK-VF2IC1-NEXT: br i1 [[TMP10]], label %pred.load.if1, label %pred.load.continue2
; CHECK-VF2IC1: pred.load.if1:
-; CHECK-VF2IC1-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 1
-; CHECK-VF2IC1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[TMP11]]
+; CHECK-VF2IC1: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 {{%.*}}
; CHECK-VF2IC1-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4
; CHECK-VF2IC1-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP13]], i32 1
-; CHECK-VF2IC1-NEXT: br label [[PRED_LOAD_CONTINUE2]]
+; CHECK-VF2IC1-NEXT: br label %pred.load.continue2
; CHECK-VF2IC1: pred.load.continue2:
-; CHECK-VF2IC1-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ]
+; CHECK-VF2IC1-NEXT: [[TMP15:%.*]] = phi <2 x i32> [ [[TMP9]], %pred.load.continue ], [ [[TMP14]], %pred.load.if1 ]
; CHECK-VF2IC1-NEXT: [[TMP16:%.*]] = icmp eq <2 x i32> [[TMP15]], <i32 2, i32 2>
; CHECK-VF2IC1-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x i32> <i32 1, i32 1>, <2 x i32> [[VEC_PHI]]
; CHECK-VF2IC1-NEXT: [[TMP18:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
; CHECK-VF2IC1-NEXT: [[PREDPHI]] = select <2 x i1> [[TMP4]], <2 x i32> [[TMP17]], <2 x i32> [[VEC_PHI]]
-; CHECK-VF2IC1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-VF2IC1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF2IC1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF2IC1: br i1 {{%.*}}, label %middle.block, label %vector.body
; CHECK-VF2IC1: middle.block:
; CHECK-VF2IC1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne <2 x i32> [[PREDPHI]], zeroinitializer
; CHECK-VF2IC1-NEXT: [[TMP20:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[RDX_SELECT_CMP]])
; CHECK-VF2IC1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[TMP20]], i32 1, i32 0
-; CHECK-VF2IC1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF2IC1-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-VF2IC1: scalar.ph:
-; CHECK-VF2IC1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF2IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF2IC1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-VF2IC1: [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ]
+; CHECK-VF2IC1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ]
+; CHECK-VF2IC1-NEXT: br label %for.body
; CHECK-VF2IC1: for.body:
-; CHECK-VF2IC1-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-VF2IC1-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-VF2IC1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[I_013]]
-; CHECK-VF2IC1-NEXT: [[TMP21:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-VF2IC1: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ]
+; CHECK-VF2IC1: [[TMP21:%.*]] = load i32, i32* {{%.*}}, align 4
; CHECK-VF2IC1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP21]], 35
-; CHECK-VF2IC1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK-VF2IC1-NEXT: br i1 [[CMP1]], label %if.then, label %for.inc
; CHECK-VF2IC1: if.then:
-; CHECK-VF2IC1-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[I_013]]
-; CHECK-VF2IC1-NEXT: [[TMP22:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-VF2IC1: [[TMP22:%.*]] = load i32, i32* {{%.*}}, align 4
; CHECK-VF2IC1-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP22]], 2
; CHECK-VF2IC1-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
-; CHECK-VF2IC1-NEXT: br label [[FOR_INC]]
+; CHECK-VF2IC1-NEXT: br label %for.inc
; CHECK-VF2IC1: for.inc:
-; CHECK-VF2IC1-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
-; CHECK-VF2IC1-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1
-; CHECK-VF2IC1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF2IC1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-VF2IC1-NEXT: [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ]
; CHECK-VF2IC1: for.end.loopexit:
-; CHECK-VF2IC1-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-VF2IC1-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ]
; CHECK-VF2IC1-NEXT: ret i32 [[R_1_LCSSA]]
;
; CHECK-VF1IC2-LABEL: @pred_select_const_i32_from_icmp(
-; CHECK-VF1IC2-NEXT: entry:
-; CHECK-VF1IC2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 2
-; CHECK-VF1IC2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-VF1IC2: vector.ph:
-; CHECK-VF1IC2-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2
-; CHECK-VF1IC2-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
-; CHECK-VF1IC2-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-VF1IC2: vector.body:
-; CHECK-VF1IC2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE4:%.*]] ]
-; CHECK-VF1IC2-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI:%.*]], [[PRED_LOAD_CONTINUE4]] ]
-; CHECK-VF1IC2-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[PREDPHI5:%.*]], [[PRED_LOAD_CONTINUE4]] ]
-; CHECK-VF1IC2-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-VF1IC2-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 1
-; CHECK-VF1IC2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 [[INDUCTION]]
-; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[INDUCTION2]]
+; CHECK-VF1IC2: [[VEC_PHI:%.*]] = phi i32 [ 0, %vector.ph ], [ [[PREDPHI:%.*]], %pred.load.continue4 ]
+; CHECK-VF1IC2-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, %vector.ph ], [ [[PREDPHI5:%.*]], %pred.load.continue4 ]
+; CHECK-VF1IC2: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[SRC1:%.*]], i64 {{%.*}}
+; CHECK-VF1IC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 {{%.*}}
; CHECK-VF1IC2-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP0]], align 4
; CHECK-VF1IC2-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1]], align 4
; CHECK-VF1IC2-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2]], 35
; CHECK-VF1IC2-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], 35
-; CHECK-VF1IC2-NEXT: br i1 [[TMP4]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK-VF1IC2-NEXT: br i1 [[TMP4]], label %pred.load.if, label %pred.load.continue
; CHECK-VF1IC2: pred.load.if:
-; CHECK-VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 [[INDUCTION]]
+; CHECK-VF1IC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[SRC2:%.*]], i64 {{%.*}}
; CHECK-VF1IC2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4
-; CHECK-VF1IC2-NEXT: br label [[PRED_LOAD_CONTINUE]]
+; CHECK-VF1IC2-NEXT: br label %pred.load.continue
; CHECK-VF1IC2: pred.load.continue:
-; CHECK-VF1IC2-NEXT: [[TMP8:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
-; CHECK-VF1IC2-NEXT: br i1 [[TMP5]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
+; CHECK-VF1IC2-NEXT: [[TMP8:%.*]] = phi i32 [ poison, %vector.body ], [ [[TMP7]], %pred.load.if ]
+; CHECK-VF1IC2-NEXT: br i1 [[TMP5]], label %pred.load.if3, label %pred.load.continue4
; CHECK-VF1IC2: pred.load.if3:
-; CHECK-VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[INDUCTION2]]
+; CHECK-VF1IC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 {{%.*}}
; CHECK-VF1IC2-NEXT: [[TMP10:%.*]] = load i32, i32* [[TMP9]], align 4
-; CHECK-VF1IC2-NEXT: br label [[PRED_LOAD_CONTINUE4]]
+; CHECK-VF1IC2-NEXT: br label %pred.load.continue4
; CHECK-VF1IC2: pred.load.continue4:
-; CHECK-VF1IC2-NEXT: [[TMP11:%.*]] = phi i32 [ poison, [[PRED_LOAD_CONTINUE]] ], [ [[TMP10]], [[PRED_LOAD_IF3]] ]
+; CHECK-VF1IC2-NEXT: [[TMP11:%.*]] = phi i32 [ poison, %pred.load.continue ], [ [[TMP10]], %pred.load.if3 ]
; CHECK-VF1IC2-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP8]], 2
; CHECK-VF1IC2-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP11]], 2
; CHECK-VF1IC2-NEXT: [[TMP14:%.*]] = select i1 [[TMP12]], i32 1, i32 [[VEC_PHI]]
-; CHECK-VF1IC2-NEXT: [[TMP15:%.*]] = select i1 [[TMP13]], i32 1, i32 [[VEC_PHI1]]
+; CHECK-VF1IC2-NEXT: [[TMP15:%.*]] = select i1 [[TMP13]], i32 1, i32 [[VEC_PHI2]]
; CHECK-VF1IC2-NEXT: [[TMP16:%.*]] = xor i1 [[TMP4]], true
; CHECK-VF1IC2-NEXT: [[TMP17:%.*]] = xor i1 [[TMP5]], true
; CHECK-VF1IC2-NEXT: [[PREDPHI]] = select i1 [[TMP4]], i32 [[TMP14]], i32 [[VEC_PHI]]
-; CHECK-VF1IC2-NEXT: [[PREDPHI5]] = select i1 [[TMP5]], i32 [[TMP15]], i32 [[VEC_PHI1]]
-; CHECK-VF1IC2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-VF1IC2-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-VF1IC2-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-VF1IC2-NEXT: [[PREDPHI5]] = select i1 [[TMP5]], i32 [[TMP15]], i32 [[VEC_PHI2]]
+; CHECK-VF1IC2: br i1 {{%.*}}, label %middle.block, label %vector.body
; CHECK-VF1IC2: middle.block:
; CHECK-VF1IC2-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[PREDPHI]], 0
; CHECK-VF1IC2-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[PREDPHI]], i32 [[PREDPHI5]]
-; CHECK-VF1IC2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
-; CHECK-VF1IC2-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-VF1IC2: br i1 {{%.*}}, label %for.end.loopexit, label %scalar.ph
; CHECK-VF1IC2: scalar.ph:
-; CHECK-VF1IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
-; CHECK-VF1IC2-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-VF1IC2-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ {{%.*}}, %middle.block ], [ 0, %entry ]
+; CHECK-VF1IC2-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, %entry ], [ [[RDX_SELECT]], %middle.block ]
+; CHECK-VF1IC2-NEXT: br label %for.body
; CHECK-VF1IC2: for.body:
-; CHECK-VF1IC2-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
-; CHECK-VF1IC2-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], [[FOR_INC]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
-; CHECK-VF1IC2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[SRC1]], i64 [[I_013]]
-; CHECK-VF1IC2-NEXT: [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-VF1IC2-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], %for.inc ], [ [[BC_RESUME_VAL]], %scalar.ph ]
+; CHECK-VF1IC2-NEXT: [[R_012:%.*]] = phi i32 [ [[R_1:%.*]], %for.inc ], [ [[BC_MERGE_RDX]], %scalar.ph ]
+; CHECK-VF1IC2: [[TMP19:%.*]] = load i32, i32* {{%.*}}, align 4
; CHECK-VF1IC2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP19]], 35
-; CHECK-VF1IC2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]]
+; CHECK-VF1IC2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label %for.inc
; CHECK-VF1IC2: if.then:
-; CHECK-VF1IC2-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[SRC2]], i64 [[I_013]]
-; CHECK-VF1IC2-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-VF1IC2: [[TMP20:%.*]] = load i32, i32* {{%.*}}, align 4
; CHECK-VF1IC2-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP20]], 2
; CHECK-VF1IC2-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[CMP3]], i32 1, i32 [[R_012]]
-; CHECK-VF1IC2-NEXT: br label [[FOR_INC]]
+; CHECK-VF1IC2-NEXT: br label %for.inc
; CHECK-VF1IC2: for.inc:
-; CHECK-VF1IC2-NEXT: [[R_1]] = phi i32 [ [[R_012]], [[FOR_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
-; CHECK-VF1IC2-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1
-; CHECK-VF1IC2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
-; CHECK-VF1IC2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-VF1IC2-NEXT: [[R_1]] = phi i32 [ [[R_012]], %for.body ], [ [[SPEC_SELECT]], %if.then ]
+; CHECK-VF1IC2: br i1 {{%.*}}, label %for.end.loopexit, label %for.body
; CHECK-VF1IC2: for.end.loopexit:
-; CHECK-VF1IC2-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], [[FOR_INC]] ], [ [[RDX_SELECT]], [[MIDDLE_BLOCK]] ]
+; CHECK-VF1IC2-NEXT: [[R_1_LCSSA:%.*]] = phi i32 [ [[R_1]], %for.inc ], [ [[RDX_SELECT]], %middle.block ]
; CHECK-VF1IC2-NEXT: ret i32 [[R_1_LCSSA]]
;
entry:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], <i64 0, i64 1, i64 2, i64 3>
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], <i32 10, i32 10, i32 10, i32 10>
-; CHECK-NEXT: [[TMP2]] = select <4 x i1> [[TMP1]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 10, i32 10, i32 10, i32 10>
-; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP2]], <4 x i32> [[VEC_PHI]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT3]], <4 x i64> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i64> [[BROADCAST_SPLAT4]], <i64 0, i64 1, i64 2, i64 3>
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[VEC_PHI]], <i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT: [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> <i32 10, i32 10, i32 10, i32 10>
+; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP3]])
+; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]])
; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[EXTRA_ITER]], [[LOOP_PREHEADER]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[LOOP_PREHEADER]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[LOOP_PREHEADER]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[NEXT:%.*]] = phi i32 [ [[SEL:%.*]], [[LOOP]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[SEL]] = select i1 [[SEL_COND]], i32 [[NEXT]], i32 10
; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 0
-; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[EC]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
; CHECK: exit.loopexit:
-; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], [[LOOP]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], [[LOOP]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ]
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ 0, [[CHECK]] ], [ [[SEL_LCSSA]], [[EXIT_LOOPEXIT]] ]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i16
-; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[TMP0]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP1]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <2 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP5]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP7:%.*]] = xor <2 x i1> [[TMP6]], <i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i16> <i16 1, i16 1>, <2 x i16> [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, i16* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i16* [[TMP9]] to <2 x i16>*
-; CHECK-NEXT: store <2 x i16> [[PREDPHI]], <2 x i16>* [[TMP10]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDEX]] to i16
+; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[TMP2]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[TMP4]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[TMP5]] to <2 x i16>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP6]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP7]], <i1 true, i1 true>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP7]], <2 x i16> <i16 1, i16 1>, <2 x i16> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[TMP9]], i32 0
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i16* [[TMP10]] to <2 x i16>*
+; CHECK-NEXT: store <2 x i16> [[PREDPHI]], <2 x i16>* [[TMP11]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 32, 32
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: store i16 [[RES]], i16* [[DST_PTR]], align 2
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], [[LOOP2:!llvm.loop !.*]]
; CHECK: exit:
; CHECK-NEXT: ret void
;
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i16
-; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[TMP0]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP1]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, i16* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[TMP5]] to <2 x i16>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP6]], align 1
-; CHECK-NEXT: [[TMP7:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP8:%.*]] = xor <2 x i1> [[TMP3]], <i1 true, i1 true>
-; CHECK-NEXT: [[TMP9:%.*]] = xor <2 x i1> [[TMP7]], <i1 true, i1 true>
-; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP9]], <2 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x i1> [[TMP7]], <2 x i1> zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP10]], <2 x i16> [[WIDE_LOAD]], <2 x i16> zeroinitializer
-; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP11]], <2 x i16> <i16 1, i16 1>, <2 x i16> [[PREDPHI]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP2]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, i16* [[TMP12]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast i16* [[TMP13]] to <2 x i16>*
-; CHECK-NEXT: store <2 x i16> [[PREDPHI1]], <2 x i16>* [[TMP14]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDEX]] to i16
+; CHECK-NEXT: [[TMP3:%.*]] = add i16 [[TMP2]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, i16* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <2 x i16>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i16>, <2 x i16>* [[TMP7]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9:%.*]] = xor <2 x i1> [[TMP4]], <i1 true, i1 true>
+; CHECK-NEXT: [[TMP10:%.*]] = xor <2 x i1> [[TMP8]], <i1 true, i1 true>
+; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP10]], <2 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP4]], <2 x i1> [[TMP8]], <2 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP11]], <2 x i16> [[WIDE_LOAD]], <2 x i16> zeroinitializer
+; CHECK-NEXT: [[PREDPHI1:%.*]] = select <2 x i1> [[TMP12]], <2 x i16> <i16 1, i16 1>, <2 x i16> [[PREDPHI]]
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, i16* [[TMP13]], i32 0
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i16* [[TMP14]] to <2 x i16>*
+; CHECK-NEXT: store <2 x i16> [[PREDPHI1]], <2 x i16>* [[TMP15]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 32, 32
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16
-; CHECK-NEXT: [[CMP_A:%.*]] = icmp ugt i64 [[IV]], [[A]]
-; CHECK-NEXT: br i1 [[CMP_A]], label [[LOOP_COND:%.*]], label [[LOOP_LATCH]]
-; CHECK: loop.cond:
-; CHECK-NEXT: [[BLEND:%.*]] = phi i16 [ [[IV_TRUNC]], [[LOOP_HEADER]] ]
-; CHECK-NEXT: [[SRC_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[BLEND]]
-; CHECK-NEXT: [[LV:%.*]] = load i16, i16* [[SRC_PTR]], align 1
-; CHECK-NEXT: [[CMP_B:%.*]] = icmp sgt i64 [[IV]], [[A]]
-; CHECK-NEXT: br i1 [[CMP_B]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]]
-; CHECK: loop.next:
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[RES:%.*]] = phi i16 [ 0, [[LOOP_HEADER]] ], [ [[LV]], [[LOOP_COND]] ], [ 1, [[LOOP_NEXT]] ]
-; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[IV]]
-; CHECK-NEXT: store i16 [[RES]], i16* [[DST_PTR]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
;
entry:
br label %loop.header
; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND3:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT4:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x i16> [[VEC_IND3]], <2 x i16> [[VEC_IND1]]
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = load i16, i16* [[TMP4]], align 1
-; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[TMP6]], align 1
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i16> poison, i16 [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i16> [[TMP9]], i16 [[TMP8]], i32 1
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[DST:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16* [[TMP12]] to <2 x i16>*
-; CHECK-NEXT: store <2 x i16> [[TMP10]], <2 x i16>* [[TMP13]], align 2
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i1> [[TMP2]], <i1 true, i1 true>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i16> [[VEC_IND3]], <2 x i16> [[VEC_IND1]]
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i16> [[PREDPHI]], i32 1
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP6]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[TMP5]], align 1
+; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[TMP7]], align 1
+; CHECK-NEXT: [[INS1:%.+]] = insertelement <2 x i16> poison, i16 [[TMP8]], i32 0
+; CHECK-NEXT: [[INS2:%.+]] = insertelement <2 x i16> [[INS1]], i16 [[TMP9]], i32 1
+; CHECK-NEXT: [[DST0:%.+]] = getelementptr inbounds i16, i16* %dst, i64 [[TMP0]]
+; CHECK-NEXT: [[DST1:%.+]] = getelementptr inbounds i16, i16* [[DST0]], i32 0
+; CHECK-NEXT: [[DST1_BC:%.+]] = bitcast i16* [[DST1]] to <2 x i16>*
+; CHECK-NEXT: store <2 x i16> [[INS2]], <2 x i16>* [[DST1_BC]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], <i16 2, i16 2>
; CHECK-NEXT: [[VEC_IND_NEXT4]] = add <2 x i16> [[VEC_IND3]], <i16 2, i16 2>
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 32, 32
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16
-; CHECK-NEXT: [[IV_TRUNC_2:%.*]] = trunc i64 [[IV]] to i16
-; CHECK-NEXT: [[CMP_A:%.*]] = icmp ugt i64 [[IV]], [[A]]
-; CHECK-NEXT: br i1 [[CMP_A]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]]
-; CHECK: loop.next:
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[BLEND:%.*]] = phi i16 [ [[IV_TRUNC]], [[LOOP_HEADER]] ], [ [[IV_TRUNC_2]], [[LOOP_NEXT]] ]
-; CHECK-NEXT: [[SRC_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[BLEND]]
-; CHECK-NEXT: [[LV:%.*]] = load i16, i16* [[SRC_PTR]], align 1
-; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds i16, i16* [[DST]], i64 [[IV]]
-; CHECK-NEXT: store i16 [[LV]], i16* [[DST_PTR]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
;
entry:
br label %loop.header
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE4]] ]
; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <2 x i16> [ <i16 0, i16 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[PRED_LOAD_CONTINUE4]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]]
; CHECK: pred.load.if:
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[VEC_IND1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = load i16, i16* [[TMP4]], align 1
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i16> poison, i16 [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[VEC_IND1]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = load i16, i16* [[TMP5]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i16> poison, i16 [[TMP6]], i32 0
; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE]]
; CHECK: pred.load.continue:
-; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_LOAD_IF]] ]
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
+; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4]]
; CHECK: pred.load.if3:
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[VEC_IND1]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[TMP10]], align 1
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i16> [[TMP7]], i16 [[TMP11]], i32 1
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[VEC_IND1]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[TMP11]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i16> [[TMP8]], i16 [[TMP12]], i32 1
; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]]
; CHECK: pred.load.continue4:
-; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP7]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP12]], [[PRED_LOAD_IF3]] ]
-; CHECK-NEXT: [[TMP14:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT: [[TMP15:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
-; CHECK-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[TMP14]], <i1 true, i1 true>
-; CHECK-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP1]], <2 x i1> [[TMP16]], <2 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP18:%.*]] = select <2 x i1> [[TMP1]], <2 x i1> [[TMP14]], <2 x i1> zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP17]], <2 x i16> [[TMP13]], <2 x i16> zeroinitializer
-; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[TMP18]], <2 x i16> <i16 1, i16 1>, <2 x i16> [[PREDPHI]]
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, i16* [[TMP19]], i32 0
-; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16* [[TMP20]] to <2 x i16>*
-; CHECK-NEXT: store <2 x i16> [[PREDPHI5]], <2 x i16>* [[TMP21]], align 2
+; CHECK-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP13]], [[PRED_LOAD_IF3]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i64> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[TMP2]], <i1 true, i1 true>
+; CHECK-NEXT: [[TMP17:%.*]] = xor <2 x i1> [[TMP15]], <i1 true, i1 true>
+; CHECK-NEXT: [[TMP18:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP17]], <2 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP19:%.*]] = select <2 x i1> [[TMP2]], <2 x i1> [[TMP15]], <2 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP18]], <2 x i16> [[TMP14]], <2 x i16> zeroinitializer
+; CHECK-NEXT: [[PREDPHI5:%.*]] = select <2 x i1> [[TMP19]], <2 x i16> <i16 1, i16 1>, <2 x i16> [[PREDPHI]]
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP0]]
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i16, i16* [[TMP20]], i32 0
+; CHECK-NEXT: [[TMP22:%.*]] = bitcast i16* [[TMP21]] to <2 x i16>*
+; CHECK-NEXT: store <2 x i16> [[PREDPHI5]], <2 x i16>* [[TMP22]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <2 x i16> [[VEC_IND1]], <i16 2, i16 2>
-; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
-; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], 64
+; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, 64
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 64, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i16
-; CHECK-NEXT: [[CMP_A:%.*]] = icmp ugt i64 [[IV]], [[A]]
-; CHECK-NEXT: br i1 [[CMP_A]], label [[LOOP_COND:%.*]], label [[LOOP_LATCH]]
-; CHECK: loop.cond:
-; CHECK-NEXT: [[BLEND:%.*]] = phi i16 [ [[IV_TRUNC]], [[LOOP_HEADER]] ]
-; CHECK-NEXT: [[SRC_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @src, i16 0, i16 [[BLEND]]
-; CHECK-NEXT: [[LV:%.*]] = load i16, i16* [[SRC_PTR]], align 1
-; CHECK-NEXT: [[CMP_B:%.*]] = icmp sgt i64 [[IV]], [[A]]
-; CHECK-NEXT: br i1 [[CMP_B]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]]
-; CHECK: loop.next:
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[RES:%.*]] = phi i16 [ 0, [[LOOP_HEADER]] ], [ [[LV]], [[LOOP_COND]] ], [ 1, [[LOOP_NEXT]] ]
-; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[IV]]
-; CHECK-NEXT: store i16 [[RES]], i16* [[DST_PTR]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 63
-; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
;
entry:
br label %loop.header
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <2 x i32>*
-; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP3]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[VEC_IND]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[PTR:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <2 x i32>*
+; CHECK-NEXT: store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP4]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1000, 1000
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[ADD_I:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[C_0:%.*]] = icmp ugt i32 [[IV]], [[X:%.*]]
-; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[IV]], [[LOOP_HEADER]] ], [ [[IV]], [[LOOP_HEADER]] ]
-; CHECK-NEXT: [[GEP_PTR:%.*]] = getelementptr i32, i32* [[PTR]], i32 [[P]]
-; CHECK-NEXT: store i32 [[P]], i32* [[GEP_PTR]], align 4
-; CHECK-NEXT: [[ADD_I]] = add nsw i32 [[P]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[ADD_I]], 1000
-; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
;
entry:
br label %loop.header
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -basic-aa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; Make sure that vectorizing this reverse-iterating loop does not generate
; a shuffle too many.
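;
; As a reading aid, here is a rough C equivalent of the loop in @t, reconstructed
; from the IR and check lines below; the original source is not part of this test,
; so the names and bounds are only illustrative (uf, xi and q are the global
; int[100] arrays referenced in the checks):
;
;   void t(void) {
;     for (long i = 94; i > 3; --i) {
;       uf[i] += xi[i];   /* load, add, store ...            */
;       uf[i] += q[i];    /* ... then update the same element */
;     }
;   }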
define void @t() {
-; CHECK-LABEL: @t(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = sub i64 94, [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @uf, i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @xi, i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i64 -3
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 -3
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @q, i64 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i64 -3
-; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4
-; CHECK-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[TMP7]], [[WIDE_LOAD4]]
-; CHECK-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* [[TMP13]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 88
-; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 5, [[MIDDLE_BLOCK]] ], [ 93, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @uf, i64 0, i64 [[TMP15]]
-; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @xi, i64 0, i64 [[TMP15]]
-; CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4
-; CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP17]], [[TMP16]]
-; CHECK-NEXT: store i32 [[ADD4]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [100 x i32], [100 x i32]* @q, i64 0, i64 [[TMP15]]
-; CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX7]], align 4
-; CHECK-NEXT: [[ADD8:%.*]] = add nsw i32 [[ADD4]], [[TMP18]]
-; CHECK-NEXT: store i32 [[ADD8]], i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], -1
-; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP19]], 2
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
entry:
br label %for.body
+; CHECK-LABEL: @t(
+; CHECK: vector.body:
+; CHECK: [[VAR1:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; CHECK: [[VAR2:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; CHECK: [[VAR3:%[a-zA-Z0-9]+]] = add nsw <4 x i32> [[VAR2]], [[VAR1]]
+; CHECK: store <4 x i32> [[VAR3]]
+; CHECK: [[VAR4:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; CHECK: add nsw <4 x i32> [[VAR3]], [[VAR4]]
+; CHECK-NOT: shufflevector
for.body:
%indvars.iv = phi i64 [ 93, %entry ], [ %indvars.iv.next, %for.body ]
; CHECK: vector.ph:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ]
-; CHECK-NEXT: [[VEC_IV:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[VEC_IV4:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[VEC_IV5:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IV6:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[VEC_IV]], 14
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV4]], 14
-; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV5]], 14
-; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i64 [[VEC_IV6]], 14
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ]
+; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[INDUCTION]], 14
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[INDUCTION1]], 14
+; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[INDUCTION2]], 14
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i64 [[INDUCTION3]], 14
; CHECK-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
; CHECK: pred.store.if:
-; CHECK-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[INDUCTION]]
+; CHECK-NEXT: [[SUNK_IND0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 [[SUNK_IND0]]
; CHECK-NEXT: store i32 0, i32* [[TMP4]], align 4
; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
; CHECK: pred.store.continue:
-; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]]
+; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]]
; CHECK: pred.store.if7:
-; CHECK-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDUCTION1]]
+; CHECK-NEXT: [[SUNK_IND1:%.*]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[SUNK_IND1]]
; CHECK-NEXT: store i32 0, i32* [[TMP5]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]]
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE5]]
; CHECK: pred.store.continue8:
-; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]]
+; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]]
; CHECK: pred.store.if9:
-; CHECK-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDUCTION2]]
+; CHECK-NEXT: [[SUNK_IND2:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[SUNK_IND2]]
; CHECK-NEXT: store i32 0, i32* [[TMP6]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]]
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE7]]
; CHECK: pred.store.continue10:
-; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]]
+; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]]
; CHECK: pred.store.if11:
-; CHECK-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[INDUCTION3]]
+; CHECK-NEXT: [[SUNK_IND3:%.*]] = add i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 [[SUNK_IND3]]
; CHECK-NEXT: store i32 0, i32* [[TMP7]], align 4
-; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]]
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]]
; CHECK: pred.store.continue12:
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -tbaa -basic-aa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s
; RUN: opt < %s -basic-aa -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s --check-prefix=CHECK-NOTBAA
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
; TBAA partitions the accesses in this loop, so it can be vectorized without
; runtime checks.
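;
; As a reading aid, a rough C equivalent of the loop in @test1, reconstructed from
; the IR checks below (names are illustrative; the 1600 trip count and the
; float-to-int conversion come from the checked IR):
;
;   int test1(int *a, const float *b) {
;     for (int i = 0; i < 1600; ++i)
;       a[i] = (int)b[i];   /* int and float accesses cannot alias under TBAA */
;     return 0;
;   }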
define i32 @test1(i32* nocapture %a, float* nocapture readonly %b) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; CHECK-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[WIDE_LOAD]] to <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4, !tbaa [[TBAA4:![0-9]+]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret i32 0
-;
-; CHECK-NOTBAA-LABEL: @test1(
-; CHECK-NOTBAA-NEXT: entry:
-; CHECK-NOTBAA-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1600
-; CHECK-NOTBAA-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[B:%.*]], i64 1600
-; CHECK-NOTBAA-NEXT: [[TMP0:%.*]] = bitcast float* [[SCEVGEP4]] to i32*
-; CHECK-NOTBAA-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[TMP0]], [[A]]
-; CHECK-NOTBAA-NEXT: [[TMP1:%.*]] = bitcast i32* [[SCEVGEP]] to float*
-; CHECK-NOTBAA-NEXT: [[BOUND1:%.*]] = icmp ugt float* [[TMP1]], [[B]]
-; CHECK-NOTBAA-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NOTBAA-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
-; CHECK-NOTBAA: vector.body:
-; CHECK-NOTBAA-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NOTBAA-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
-; CHECK-NOTBAA-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NOTBAA-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !tbaa [[TBAA0:![0-9]+]], !alias.scope !4
-; CHECK-NOTBAA-NEXT: [[TMP4:%.*]] = fptosi <4 x float> [[WIDE_LOAD]] to <4 x i32>
-; CHECK-NOTBAA-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
-; CHECK-NOTBAA-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NOTBAA-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP6]], align 4, !tbaa [[TBAA7:![0-9]+]], !alias.scope !9, !noalias !4
-; CHECK-NOTBAA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NOTBAA-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NOTBAA-NEXT: br i1 [[TMP7]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK-NOTBAA: for.body:
-; CHECK-NOTBAA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-NOTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NOTBAA-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX]], align 4, !tbaa [[TBAA0]]
-; CHECK-NOTBAA-NEXT: [[CONV:%.*]] = fptosi float [[TMP8]] to i32
-; CHECK-NOTBAA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NOTBAA-NEXT: store i32 [[CONV]], i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA7]]
-; CHECK-NOTBAA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NOTBAA-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1600
-; CHECK-NOTBAA-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK-NOTBAA: for.end:
-; CHECK-NOTBAA-NEXT: ret i32 0
-;
+; CHECK-LABEL: @test1
+; CHECK: entry:
+; CHECK-NEXT: br label %vector.body
+; CHECK: vector.body:
+; CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa
+; CHECK: ret i32 0
+; CHECK-NOTBAA-LABEL: @test1
+; CHECK-NOTBAA: entry:
+; CHECK-NOTBAA: icmp ugt i32*
+; CHECK-NOTBAA: icmp ugt float*
+; CHECK-NOTBAA-NOT: icmp
+; CHECK-NOTBAA: br i1 {{.+}}, label %for.body, label %vector.body
+; CHECK-NOTBAA: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK-NOTBAA: store <4 x i32> %{{.*}}, <4 x i32>* %{{.*}}, align 4, !tbaa
+; CHECK-NOTBAA: ret i32 0
entry:
br label %for.body
; This test is like the first, except here there is still one runtime check
; required. Without TBAA, however, two checks are required.
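;
; A rough C equivalent of the loop in @test2, reconstructed from the IR checks
; below (names are illustrative):
;
;   int test2(const int *a, const float *b, float *c) {
;     for (int i = 0; i < 1600; ++i)
;       c[i] = b[i] * (float)a[i];  /* TBAA rules out 'a' aliasing 'c', so only
;                                      the b-vs-c overlap needs a runtime check */
;     return 0;
;   }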
define i32 @test2(i32* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[C:%.*]], i64 1600
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[B:%.*]], i64 1600
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[SCEVGEP4]], [[C]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[TMP0]] to <4 x float>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4, !tbaa [[TBAA0]], !alias.scope !8
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[TMP4:%.*]] = sitofp <4 x i32> [[WIDE_LOAD6]] to <4 x float>
-; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[TMP6]] to <4 x float>*
-; CHECK-NEXT: store <4 x float> [[TMP5]], <4 x float>* [[TMP7]], align 4, !tbaa [[TBAA0]], !alias.scope !11, !noalias !8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NEXT: br i1 [[TMP8]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
-; CHECK: for.body:
-; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP9:%.*]] = load float, float* [[ARRAYIDX]], align 4, !tbaa [[TBAA0]]
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA4]]
-; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP10]] to float
-; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP9]], [[CONV]]
-; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[MUL]], float* [[ARRAYIDX4]], align 4, !tbaa [[TBAA0]]
-; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1600
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret i32 0
-;
-; CHECK-NOTBAA-LABEL: @test2(
-; CHECK-NOTBAA-NEXT: entry:
-; CHECK-NOTBAA-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[C:%.*]], i64 1600
-; CHECK-NOTBAA-NEXT: [[SCEVGEP4:%.*]] = getelementptr float, float* [[B:%.*]], i64 1600
-; CHECK-NOTBAA-NEXT: [[SCEVGEP7:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 1600
-; CHECK-NOTBAA-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[SCEVGEP4]], [[C]]
-; CHECK-NOTBAA-NEXT: [[BOUND1:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]]
-; CHECK-NOTBAA-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NOTBAA-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP7]] to float*
-; CHECK-NOTBAA-NEXT: [[BOUND09:%.*]] = icmp ugt float* [[TMP0]], [[C]]
-; CHECK-NOTBAA-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
-; CHECK-NOTBAA-NEXT: [[BOUND110:%.*]] = icmp ugt i32* [[TMP1]], [[A]]
-; CHECK-NOTBAA-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; CHECK-NOTBAA-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
-; CHECK-NOTBAA-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
-; CHECK-NOTBAA: vector.body:
-; CHECK-NOTBAA-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NOTBAA-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
-; CHECK-NOTBAA-NEXT: [[TMP3:%.*]] = bitcast float* [[TMP2]] to <4 x float>*
-; CHECK-NOTBAA-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>* [[TMP3]], align 4, !tbaa [[TBAA0]], !alias.scope !14
-; CHECK-NOTBAA-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]]
-; CHECK-NOTBAA-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NOTBAA-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !tbaa [[TBAA7]], !alias.scope !17
-; CHECK-NOTBAA-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[WIDE_LOAD12]] to <4 x float>
-; CHECK-NOTBAA-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[TMP6]]
-; CHECK-NOTBAA-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDEX]]
-; CHECK-NOTBAA-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
-; CHECK-NOTBAA-NEXT: store <4 x float> [[TMP7]], <4 x float>* [[TMP9]], align 4, !tbaa [[TBAA0]], !alias.scope !19, !noalias !21
-; CHECK-NOTBAA-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NOTBAA-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1600
-; CHECK-NOTBAA-NEXT: br i1 [[TMP10]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-NOTBAA: for.body:
-; CHECK-NOTBAA-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-NOTBAA-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
-; CHECK-NOTBAA-NEXT: [[TMP11:%.*]] = load float, float* [[ARRAYIDX]], align 4, !tbaa [[TBAA0]]
-; CHECK-NOTBAA-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NOTBAA-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4, !tbaa [[TBAA7]]
-; CHECK-NOTBAA-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP12]] to float
-; CHECK-NOTBAA-NEXT: [[MUL:%.*]] = fmul float [[TMP11]], [[CONV]]
-; CHECK-NOTBAA-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, float* [[C]], i64 [[INDVARS_IV]]
-; CHECK-NOTBAA-NEXT: store float [[MUL]], float* [[ARRAYIDX4]], align 4, !tbaa [[TBAA0]]
-; CHECK-NOTBAA-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NOTBAA-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1600
-; CHECK-NOTBAA-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
-; CHECK-NOTBAA: for.end:
-; CHECK-NOTBAA-NEXT: ret i32 0
-;
-
-
-
-
-
+; CHECK-LABEL: @test2
+; CHECK: entry:
+; CHECK: icmp ugt float*
+; CHECK: icmp ugt float*
+; CHECK-NOT: icmp uge i32*
+; CHECK: br i1 {{.+}}, label %for.body, label %vector.body
+
+; CHECK: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa
+
+; CHECK: ret i32 0
+
+; CHECK-NOTBAA-LABEL: @test2
+; CHECK-NOTBAA: entry:
+; CHECK-NOTBAA: icmp ugt float*
+; CHECK-NOTBAA: icmp ugt float*
+; CHECK-NOTBAA-DAG: icmp ugt float*
+; CHECK-NOTBAA-DAG: icmp ugt i32*
+; CHECK-NOTBAA-NOT: icmp
+; CHECK-NOTBAA: br i1 {{.+}}, label %for.body, label %vector.body
+
+; CHECK-NOTBAA: load <4 x float>, <4 x float>* %{{.*}}, align 4, !tbaa
+; CHECK-NOTBAA: store <4 x float> %{{.*}}, <4 x float>* %{{.*}}, align 4, !tbaa
+
+; CHECK-NOTBAA: ret i32 0
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; This test verifies that the loop vectorizer will not vectorize low trip count
; loops that require runtime checks (Trip count is computed with profile info).
; REQUIRES: asserts
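; A sketch of the estimate involved (assuming it is derived from the latch
; branch weights, in the spirit of getLoopEstimatedTripCount):
;   estimatedTripCount ~= backedgeTakenWeight / latchExitWeight + 1
; e.g. a hypothetical latch !prof of !{!"branch_weights", i32 10, i32 30} would
; suggest only ~4 iterations, too few to pay for the runtime checks.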
define i32 @foo_low_trip_count1(i32 %bound) {
; Simple loop with a low trip count. Should not be vectorized.
+
; CHECK-LABEL: @foo_low_trip_count1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0
-; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], [[BOUND:%.*]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret i32 0
-;
+; CHECK-NOT: <{{[0-9]+}} x i8>
entry:
br label %for.body
define i32 @foo_low_trip_count2(i32 %bound) !prof !0 {
; The loop has the same invocation count as the function, but has a low
; trip count per invocation and is not worth vectorizing.
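; (Hypothetical illustration: if the function and the loop preheader are both
; entered ~1,000 times while the latch weights imply only ~3 iterations per
; entry, the estimated trip count stays below the profitability threshold.)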
+
; CHECK-LABEL: @foo_low_trip_count2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0
-; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], [[BOUND:%.*]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0]]
-; CHECK: for.end:
-; CHECK-NEXT: ret i32 0
-;
+; CHECK-NOT: <{{[0-9]+}} x i8>
entry:
br label %for.body
define i32 @foo_low_trip_count3(i1 %cond, i32 %bound) !prof !0 {
; The loop has a low invocation count compared to the function invocation count,
; but has a high trip count per invocation. Vectorize it.
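; (Hypothetical illustration: a function entered ~1,000 times whose loop
; preheader is reached only ~10 times, but whose latch weights imply ~1,000
; iterations per entry, still has a high per-invocation trip count, so
; vectorizing pays off.)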
+
; CHECK-LABEL: @foo_low_trip_count3(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[COND:%.*]], label [[FOR_PREHEADER:%.*]], label [[FOR_END:%.*]], !prof [[PROF2:![0-9]+]]
-; CHECK: for.preheader:
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[BOUND:%.*]], 1
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 4
-; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]]
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[TMP6]], <4 x i8>* [[TMP7]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF3:![0-9]+]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_PREHEADER]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP9]], 0
-; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[I_08]], [[BOUND]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: for.end.loopexit:
-; CHECK-NEXT: br label [[FOR_END]]
-; CHECK: for.end:
-; CHECK-NEXT: ret i32 0
-;
+; CHECK: [[VECTOR_BODY:vector\.body]]:
+; CHECK: br i1 [[TMP9:%.*]], label [[MIDDLE_BLOCK:%.*]], label %[[VECTOR_BODY]], !prof [[LP3:\!.*]],
+; CHECK: [[FOR_BODY:for\.body]]:
+; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_END_LOOPEXIT:%.*]], label %[[FOR_BODY]], !prof [[LP6:\!.*]],
entry:
br i1 %cond, label %for.preheader, label %for.end, !prof !2
define i32 @foo_low_trip_count_icmp_sgt(i32 %bound) {
; Simple loop with a low trip count and an inequality test for the exit.
; Should not be vectorized.
+
; CHECK-LABEL: @foo_low_trip_count_icmp_sgt(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0
-; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp sgt i32 [[I_08]], [[BOUND:%.*]]
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]], !prof [[PROF0]]
-; CHECK: for.end:
-; CHECK-NEXT: ret i32 0
-;
+; CHECK-NOT: <{{[0-9]+}} x i8>
entry:
br label %for.body
define i32 @const_low_trip_count() {
; Simple loop with constant, small trip count and no profiling info.
-; CHECK-LABEL: @const_low_trip_count(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0
-; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 2
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK: for.end:
-; CHECK-NEXT: ret i32 0
-;
+
+; CHECK-LABEL: @const_low_trip_count
+; CHECK-NOT: <{{[0-9]+}} x i8>
entry:
br label %for.body
define i32 @const_large_trip_count() {
; Simple loop with constant large trip count and no profiling info.
-; CHECK-LABEL: @const_large_trip_count(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[TMP5]], <4 x i8>* [[TMP6]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1001, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP8]], 0
-; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret i32 0
-;
+
+; CHECK-LABEL: @const_large_trip_count
+; CHECK: <{{[0-9]+}} x i8>
entry:
br label %for.body
define i32 @const_small_trip_count_step() {
; Simple loop with static, small trip count and no profiling info.
-; CHECK-LABEL: @const_small_trip_count_step(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP0]], 0
-; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 5
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 10
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK: for.end:
-; CHECK-NEXT: ret i32 0
-;
+
+; CHECK-LABEL: @const_small_trip_count_step
+; CHECK-NOT: <{{[0-9]+}} x i8>
entry:
br label %for.body
define i32 @const_trip_over_profile() {
; Constant trip count takes precedence over profile data.
-; CHECK-LABEL: @const_trip_over_profile(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i8> [[WIDE_LOAD]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP4]], <4 x i8> <i8 2, i8 2, i8 2, i8 2>, <4 x i8> <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[TMP5]], <4 x i8>* [[TMP6]], align 1
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 1001, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32 x i8], [32 x i8]* @tab, i32 0, i32 [[I_08]]
-; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP8]], 0
-; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP1]], i8 2, i8 1
-; CHECK-NEXT: store i8 [[DOT]], i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[INC]] = add nsw i32 [[I_08]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp slt i32 [[I_08]], 1000
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]], !prof [[PROF0]], !llvm.loop [[LOOP12:![0-9]+]]
-; CHECK: for.end:
-; CHECK-NEXT: ret i32 0
-;
+
+; CHECK-LABEL: @const_trip_over_profile
+; CHECK: <{{[0-9]+}} x i8>
entry:
br label %for.body
ret i32 0
}
+; CHECK: [[LP3]] = !{!"branch_weights", i32 10, i32 2490}
+; CHECK: [[LP6]] = !{!"branch_weights", i32 10, i32 0}
; The original loop has latchExitWeight=10 and backedgeTakenWeight=10,000,
; therefore estimatedBackedgeTakenCount=1,000 and estimatedTripCount=1,001.
; Vectorizing by 4 produces estimatedTripCounts of 1,001/4=250 for the vector
; loop and 1,001%4=1 for the scalar remainder loop.
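; A sketch of how those estimates map onto the branch weights checked via
; [[LP3]] and [[LP6]] above (assuming the latch exit weight of 10 is kept):
; a loop expected to run N times gets a backedge weight of exitWeight*(N-1), so
;   vector loop:    10 * (250 - 1) = 2490  -> !{!"branch_weights", i32 10, i32 2490}
;   remainder loop: 10 * (1 - 1)   = 0     -> !{!"branch_weights", i32 10, i32 0}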
define i8 @reduction_smin_trunc(i8* noalias nocapture %ptr) {
; CHECK-LABEL: @reduction_smin_trunc(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY]] ], [ 256, [[ENTRY]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 255
-; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[IV]] to i64
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8* [[GEP]], align 1
-; CHECK-NEXT: [[EXT:%.*]] = sext i8 [[LOAD]] to i32
-; CHECK-NEXT: [[TMP1]] = call i32 @llvm.smin.i32(i32 [[SUM_02]], i32 [[EXT]])
-; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[TMP1]] to i8
-; CHECK-NEXT: ret i8 [[RET]]
-;
+; CHECK-NOT: vector.body
+; CHECK-NOT: <8 x
+; CHECK: ret
entry:
br label %for.body
define i8 @reduction_umin_trunc(i8* noalias nocapture %ptr) {
; CHECK-LABEL: @reduction_umin_trunc(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 255
-; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[IV]] to i64
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, i8* [[PTR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8* [[GEP]], align 1
-; CHECK-NEXT: [[EXT:%.*]] = zext i8 [[LOAD]] to i32
-; CHECK-NEXT: [[TMP1]] = call i32 @llvm.umin.i32(i32 [[SUM_02]], i32 [[EXT]])
-; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[TMP1]] to i8
-; CHECK-NEXT: ret i8 [[RET]]
-;
+; CHECK-NOT: vector.body
+; CHECK-NOT: <8 x
+; CHECK: ret
entry:
br label %for.body
define i16 @reduction_smax_trunc(i16* noalias nocapture %ptr) {
; CHECK-LABEL: @reduction_smax_trunc(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 65535
-; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[IV]] to i64
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[LOAD:%.*]] = load i16, i16* [[GEP]], align 2
-; CHECK-NEXT: [[EXT:%.*]] = sext i16 [[LOAD]] to i32
-; CHECK-NEXT: [[TMP1]] = call i32 @llvm.smax.i32(i32 [[SUM_02]], i32 [[EXT]])
-; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[TMP1]] to i16
-; CHECK-NEXT: ret i16 [[RET]]
-;
+; CHECK-NOT: vector.body
+; CHECK-NOT: <8 x
+; CHECK: ret
entry:
br label %for.body
define i16 @reduction_umax_trunc(i16* noalias nocapture %ptr) {
; CHECK-LABEL: @reduction_umax_trunc(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[SUM_02P:%.*]] = phi i32 [ [[TMP1:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
-; CHECK-NEXT: [[SUM_02:%.*]] = and i32 [[SUM_02P]], 65535
-; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[IV]] to i64
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i16, i16* [[PTR:%.*]], i64 [[TMP0]]
-; CHECK-NEXT: [[LOAD:%.*]] = load i16, i16* [[GEP]], align 2
-; CHECK-NEXT: [[EXT:%.*]] = zext i16 [[LOAD]] to i32
-; CHECK-NEXT: [[TMP1]] = call i32 @llvm.umax.i32(i32 [[SUM_02]], i32 [[EXT]])
-; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[IV_NEXT]], 256
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
-; CHECK: for.end:
-; CHECK-NEXT: [[RET:%.*]] = trunc i32 [[TMP1]] to i16
-; CHECK-NEXT: ret i16 [[RET]]
-;
+; CHECK-NOT: vector.body
+; CHECK-NOT: <8 x
+; CHECK: ret
entry:
br label %for.body
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=4 -S %s | FileCheck %s
@dst = external global [32 x i16], align 1
define void @blend_uniform_iv_trunc(i1 %c) {
; CHECK-LABEL: @blend_uniform_iv_trunc(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: [[MASK0:%.*]] = insertelement <4 x i1> poison, i1 %c, i32 0
+; CHECK-NEXT: [[MASK1:%.*]] = shufflevector <4 x i1> [[MASK0]], <4 x i1> poison, <4 x i32> zeroinitializer
+
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = trunc i64 [[INDEX]] to i16
-; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[TMP0]], 0
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP1]], i32 0
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[INDEX]] to i16
+; CHECK-NEXT: [[TMP2:%.*]] = add i16 [[TMP1]], 0
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i16> poison, i16 [[TMP2]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT1]], <4 x i16> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i16> [[BROADCAST_SPLAT2]], <4 x i16> undef
-; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i16> [[PREDPHI]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i16 [[TMP3]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, i16* [[TMP4]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i16* [[TMP5]] to <4 x i16>*
-; CHECK-NEXT: store <4 x i16> zeroinitializer, <4 x i16>* [[TMP6]], align 2
+; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[MASK1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[MASK1]], <4 x i16> [[BROADCAST_SPLAT2]], <4 x i16> undef
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i16> [[PREDPHI]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i16 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
+; CHECK-NEXT: store <4 x i16> zeroinitializer, <4 x i16>* [[TMP7]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 32, 32
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[IV_TRUNC_2:%.*]] = trunc i64 [[IV]] to i16
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]]
-; CHECK: loop.next:
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[BLEND:%.*]] = phi i16 [ undef, [[LOOP_HEADER]] ], [ [[IV_TRUNC_2]], [[LOOP_NEXT]] ]
-; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i16 [[BLEND]]
-; CHECK-NEXT: store i16 0, i16* [[DST_PTR]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT: br i1 [[TMP8]], label %middle.block, label %vector.body
;
-
entry:
br label %loop.header
define void @blend_uniform_iv(i1 %c) {
; CHECK-LABEL: @blend_uniform_iv(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: [[MASK0:%.*]] = insertelement <4 x i1> poison, i1 %c, i32 0
+; CHECK-NEXT: [[MASK1:%.*]] = shufflevector <4 x i1> [[MASK0]], <4 x i1> poison, <4 x i32> zeroinitializer
+
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i64> [[BROADCAST_SPLAT2]], <4 x i64> undef
+; CHECK-NEXT: [[TMP1:%.*]] = xor <4 x i1> [[MASK1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[MASK1]], <4 x i64> [[BROADCAST_SPLAT2]], <4 x i64> undef
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i64> [[PREDPHI]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <4 x i16>*
; CHECK-NEXT: store <4 x i16> zeroinitializer, <4 x i16>* [[TMP5]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 32, 32
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]]
-; CHECK: loop.next:
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[BLEND:%.*]] = phi i64 [ undef, [[LOOP_HEADER]] ], [ [[IV]], [[LOOP_NEXT]] ]
-; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[BLEND]]
-; CHECK-NEXT: store i16 0, i16* [[DST_PTR]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: br i1 [[TMP6]], label %middle.block, label %vector.body
;
-
entry:
br label %loop.header
define void @blend_chain_iv(i1 %c) {
; CHECK-LABEL: @blend_chain_iv(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT: [[MASK0:%.*]] = insertelement <4 x i1> poison, i1 %c, i32 0
+; CHECK-NEXT: [[MASK1:%.*]] = shufflevector <4 x i1> [[MASK0]], <4 x i1> poison, <4 x i32> zeroinitializer
+
; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[TMP0]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> [[BROADCAST_SPLAT]], <4 x i1> zeroinitializer
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> undef
-; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], <i1 true, i1 true, i1 true, i1 true>
-; CHECK-NEXT: [[TMP4:%.*]] = or <4 x i1> [[TMP2]], [[TMP1]]
-; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[TMP4]], <4 x i64> [[PREDPHI]], <4 x i64> undef
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 1
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 2
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i1> [[MASK1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[MASK1]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[MASK1]], <4 x i1> [[MASK1]], <4 x i1> zeroinitializer
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP6]], <4 x i64> [[VEC_IND]], <4 x i64> undef
+; CHECK-NEXT: [[TMP7:%.*]] = xor <4 x i1> [[MASK1]], <i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[PREDPHI1:%.*]] = select <4 x i1> [[TMP8]], <4 x i64> [[PREDPHI]], <4 x i64> undef
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 0
; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 3
+; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP11]]
-; CHECK-NEXT: store i16 0, i16* [[TMP6]], align 2
-; CHECK-NEXT: store i16 0, i16* [[TMP8]], align 2
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 2
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP13]]
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i64> [[PREDPHI1]], i32 3
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[TMP15]]
; CHECK-NEXT: store i16 0, i16* [[TMP10]], align 2
; CHECK-NEXT: store i16 0, i16* [[TMP12]], align 2
+; CHECK-NEXT: store i16 0, i16* [[TMP14]], align 2
+; CHECK-NEXT: store i16 0, i16* [[TMP16]], align 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 32, 32
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_NEXT:%.*]], label [[LOOP_LATCH]]
-; CHECK: loop.next:
-; CHECK-NEXT: br i1 [[C]], label [[LOOP_NEXT_2:%.*]], label [[LOOP_NEXT_3:%.*]]
-; CHECK: loop.next.2:
-; CHECK-NEXT: br label [[LOOP_NEXT_3]]
-; CHECK: loop.next.3:
-; CHECK-NEXT: [[BLEND_1:%.*]] = phi i64 [ undef, [[LOOP_NEXT]] ], [ [[IV]], [[LOOP_NEXT_2]] ]
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[BLEND:%.*]] = phi i64 [ undef, [[LOOP_HEADER]] ], [ [[BLEND_1]], [[LOOP_NEXT_3]] ]
-; CHECK-NEXT: [[DST_PTR:%.*]] = getelementptr inbounds [32 x i16], [32 x i16]* @dst, i16 0, i64 [[BLEND]]
-; CHECK-NEXT: store i16 0, i16* [[DST_PTR]], align 2
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[CMP439:%.*]] = icmp ult i64 [[IV]], 31
-; CHECK-NEXT: br i1 [[CMP439]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
+; CHECK-NEXT: br i1 [[TMP17]], label %middle.block, label %vector.body
;
-
entry:
br label %loop.header
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP5]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <4 x i8>*
+; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP6]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1
; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1
-; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
-; CHECK-NEXT: store i8 [[TMP7]], i8* [[BUFF]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT: store i8 [[TMP8]], i8* [[BUFF]], align 1
; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP2:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[TMP1]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
-; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP5]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <4 x i8>*
+; CHECK-NEXT: store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP6]], align 1
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK-NEXT: [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1
; CHECK-NEXT: [[DEC]] = add nsw i32 [[DEC66]], -1
-; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
-; CHECK-NEXT: store i8 [[TMP7]], i8* [[BUFF]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT: store i8 [[TMP8]], i8* [[BUFF]], align 1
; CHECK-NEXT: [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
; CHECK-NEXT: br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: end:
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=2 -S %s | FileCheck %s
%s1 = type { [32000 x double], [32000 x double], [32000 x double] }
define i32 @load_with_pointer_phi_no_runtime_checks(%s1* %data) {
-; CHECK-LABEL: @load_with_pointer_phi_no_runtime_checks(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i64> [[VEC_IND]], <i64 15999, i64 15999>
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[S1:%.*]], %s1* [[DATA:%.*]], i64 0, i32 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 2, <2 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 1, <2 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x double*> [[TMP4]], <2 x double*> [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double*> [[PREDPHI]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[TMP6]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double*> [[PREDPHI]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = load double, double* [[TMP8]], align 8
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[TMP9]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x double> <double 3.000000e+00, double 3.000000e+00>, [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP12]], <2 x double>* [[TMP14]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000
-; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 32000, 32000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[CMP5:%.*]] = icmp ult i64 [[IV]], 15999
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 0, i64 [[IV]]
-; CHECK-NEXT: br i1 [[CMP5]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 1, i64 [[IV]]
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: if.else:
-; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 2, i64 [[IV]]
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[GEP_2_SINK:%.*]] = phi double* [ [[GEP_2]], [[IF_ELSE]] ], [ [[GEP_1]], [[IF_THEN]] ]
-; CHECK-NEXT: [[V8:%.*]] = load double, double* [[GEP_2_SINK]], align 8
-; CHECK-NEXT: [[MUL16:%.*]] = fmul double 3.000000e+00, [[V8]]
-; CHECK-NEXT: store double [[MUL16]], double* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 32000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret i32 10
+; CHECK-LABEL: @load_with_pointer_phi_no_runtime_checks
+; CHECK-NOT: memcheck
+; CHECK: vector.body:
;
entry:
br label %loop.header
}
define i32 @store_with_pointer_phi_no_runtime_checks(%s1* %data) {
-; CHECK-LABEL: @store_with_pointer_phi_no_runtime_checks(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i64> [[VEC_IND]], <i64 15999, i64 15999>
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[S1:%.*]], %s1* [[DATA:%.*]], i64 0, i32 0, i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 2, <2 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 1, <2 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x double*> [[TMP4]], <2 x double*> [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 8
-; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> <double 3.000000e+00, double 3.000000e+00>, [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double*> [[PREDPHI]], i32 0
-; CHECK-NEXT: store double [[TMP9]], double* [[TMP10]], align 8
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double*> [[PREDPHI]], i32 1
-; CHECK-NEXT: store double [[TMP11]], double* [[TMP12]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 32000, 32000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[CMP5:%.*]] = icmp ult i64 [[IV]], 15999
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 0, i64 [[IV]]
-; CHECK-NEXT: br i1 [[CMP5]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 1, i64 [[IV]]
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: if.else:
-; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds [[S1]], %s1* [[DATA]], i64 0, i32 2, i64 [[IV]]
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[GEP_2_SINK:%.*]] = phi double* [ [[GEP_2]], [[IF_ELSE]] ], [ [[GEP_1]], [[IF_THEN]] ]
-; CHECK-NEXT: [[V8:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[MUL16:%.*]] = fmul double 3.000000e+00, [[V8]]
-; CHECK-NEXT: store double [[MUL16]], double* [[GEP_2_SINK]], align 8
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 32000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret i32 10
+; CHECK-LABEL: @store_with_pointer_phi_no_runtime_checks
+; CHECK-NOT: memcheck
+; CHECK: vector.body
;
entry:
br label %loop.header
}
define i32 @store_with_pointer_phi_runtime_checks(double* %A, double* %B, double* %C) {
-; CHECK-LABEL: @store_with_pointer_phi_runtime_checks(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[B1:%.*]] = bitcast double* [[B:%.*]] to i8*
-; CHECK-NEXT: [[C3:%.*]] = bitcast double* [[C:%.*]] to i8*
-; CHECK-NEXT: [[A6:%.*]] = bitcast double* [[A:%.*]] to i8*
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
-; CHECK: vector.memcheck:
-; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[B]], i64 32000
-; CHECK-NEXT: [[SCEVGEP2:%.*]] = bitcast double* [[SCEVGEP]] to i8*
-; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr double, double* [[C]], i64 32000
-; CHECK-NEXT: [[SCEVGEP45:%.*]] = bitcast double* [[SCEVGEP4]] to i8*
-; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr double, double* [[A]], i64 32000
-; CHECK-NEXT: [[SCEVGEP78:%.*]] = bitcast double* [[SCEVGEP7]] to i8*
-; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP45]]
-; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult i8* [[C3]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT: [[BOUND09:%.*]] = icmp ult i8* [[B1]], [[SCEVGEP78]]
-; CHECK-NEXT: [[BOUND110:%.*]] = icmp ult i8* [[A6]], [[SCEVGEP2]]
-; CHECK-NEXT: [[FOUND_CONFLICT11:%.*]] = and i1 [[BOUND09]], [[BOUND110]]
-; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT11]]
-; CHECK-NEXT: [[BOUND012:%.*]] = icmp ult i8* [[C3]], [[SCEVGEP78]]
-; CHECK-NEXT: [[BOUND113:%.*]] = icmp ult i8* [[A6]], [[SCEVGEP45]]
-; CHECK-NEXT: [[FOUND_CONFLICT14:%.*]] = and i1 [[BOUND012]], [[BOUND113]]
-; CHECK-NEXT: [[CONFLICT_RDX15:%.*]] = or i1 [[CONFLICT_RDX]], [[FOUND_CONFLICT14]]
-; CHECK-NEXT: br i1 [[CONFLICT_RDX15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <2 x i64> [[VEC_IND]], <i64 15999, i64 15999>
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[C]], <2 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, double* [[B]], <2 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP5:%.*]] = xor <2 x i1> [[TMP1]], <i1 true, i1 true>
-; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP1]], <2 x double*> [[TMP4]], <2 x double*> [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[TMP6]] to <2 x double>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, <2 x double>* [[TMP7]], align 8, !alias.scope !6
-; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x double> <double 3.000000e+00, double 3.000000e+00>, [[WIDE_LOAD]]
-; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double*> [[PREDPHI]], i32 0
-; CHECK-NEXT: store double [[TMP9]], double* [[TMP10]], align 8
-; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double*> [[PREDPHI]], i32 1
-; CHECK-NEXT: store double [[TMP11]], double* [[TMP12]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32000
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 32000, 32000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 32000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[CMP5:%.*]] = icmp ult i64 [[IV]], 15999
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IV]]
-; CHECK-NEXT: br i1 [[CMP5]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[IV]]
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: if.else:
-; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds double, double* [[C]], i64 [[IV]]
-; CHECK-NEXT: br label [[LOOP_LATCH]]
-; CHECK: loop.latch:
-; CHECK-NEXT: [[GEP_2_SINK:%.*]] = phi double* [ [[GEP_2]], [[IF_ELSE]] ], [ [[GEP_1]], [[IF_THEN]] ]
-; CHECK-NEXT: [[V8:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[MUL16:%.*]] = fmul double 3.000000e+00, [[V8]]
-; CHECK-NEXT: store double [[MUL16]], double* [[GEP_2_SINK]], align 8
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 32000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[LOOP_HEADER]], !llvm.loop [[LOOP10:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret i32 10
+; CHECK-LABEL: @store_with_pointer_phi_runtime_checks
+; CHECK: memcheck
+; CHECK: vector.body
;
entry:
br label %loop.header
}
define i32 @load_with_pointer_phi_outside_loop(double* %A, double* %B, double* %C, i1 %c.0, i1 %c.1) {
-; CHECK-LABEL: @load_with_pointer_phi_outside_loop(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[C_0:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: br label [[LOOP_PH:%.*]]
-; CHECK: if.else:
-; CHECK-NEXT: [[PTR_SELECT:%.*]] = select i1 [[C_1:%.*]], double* [[C:%.*]], double* [[B:%.*]]
-; CHECK-NEXT: br label [[LOOP_PH]]
-; CHECK: loop.ph:
-; CHECK-NEXT: [[PTR:%.*]] = phi double* [ [[A:%.*]], [[IF_THEN]] ], [ [[PTR_SELECT]], [[IF_ELSE]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[LOOP_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_HEADER]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[V8:%.*]] = load double, double* [[PTR]], align 8
-; CHECK-NEXT: [[MUL16:%.*]] = fmul double 3.000000e+00, [[V8]]
-; CHECK-NEXT: store double [[MUL16]], double* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 32000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
-; CHECK: exit:
-; CHECK-NEXT: ret i32 10
+; CHECK-LABEL: @load_with_pointer_phi_outside_loop
+; CHECK-NOT: vector.body
;
entry:
br i1 %c.0, label %if.then, label %if.else
}
define i32 @store_with_pointer_phi_outside_loop(double* %A, double* %B, double* %C, i1 %c.0, i1 %c.1) {
-; CHECK-LABEL: @store_with_pointer_phi_outside_loop(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[C_0:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: br label [[LOOP_PH:%.*]]
-; CHECK: if.else:
-; CHECK-NEXT: [[PTR_SELECT:%.*]] = select i1 [[C_1:%.*]], double* [[C:%.*]], double* [[B:%.*]]
-; CHECK-NEXT: br label [[LOOP_PH]]
-; CHECK: loop.ph:
-; CHECK-NEXT: [[PTR:%.*]] = phi double* [ [[A:%.*]], [[IF_THEN]] ], [ [[PTR_SELECT]], [[IF_ELSE]] ]
-; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
-; CHECK: loop.header:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[LOOP_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_HEADER]] ]
-; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IV]]
-; CHECK-NEXT: [[V8:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[MUL16:%.*]] = fmul double 3.000000e+00, [[V8]]
-; CHECK-NEXT: store double [[MUL16]], double* [[PTR]], align 8
-; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 32000
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT:%.*]], label [[LOOP_HEADER]]
-; CHECK: exit:
-; CHECK-NEXT: ret i32 10
+; CHECK-LABEL: @store_with_pointer_phi_outside_loop
+; CHECK-NOT: vector.body
;
entry:
br i1 %c.0, label %if.then, label %if.else
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=loop-vectorize -S 2>&1 | FileCheck %s
%type = type { [3 x double] }
define void @getScalarFunc(double* %A, double* %C, %type* %B) {
-; CHECK-LABEL: @getScalarFunc(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[FOR_BODY:%.*]]
-; CHECK: for.body:
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[DUMMYLOAD2:%.*]] = load double, double* [[A:%.*]], align 8
-; CHECK-NEXT: [[ARRAYIDX_I24:%.*]] = getelementptr inbounds [[TYPE:%.*]], %type* [[B:%.*]], i64 [[I]], i32 0, i32 0
-; CHECK-NEXT: [[_15:%.*]] = load double, double* [[ARRAYIDX_I24]], align 8
-; CHECK-NEXT: [[CALL10:%.*]] = tail call fast double @atan(double [[_15]]) #[[ATTR0:[0-9]+]]
-; CHECK-NEXT: [[INC]] = add i64 [[I]], 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 1000, [[INC]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]]
-; CHECK: for.end:
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: getScalarFunc
; This check will also catch the massv version of the function.
+; CHECK-NOT: call fast <{{[0-9]+}} x double> @{{.*}}atan(<{{[0-9]+}} x double> %{{[0-9]+}})
entry:
br label %for.body
-for.body:
+for.body:
%i = phi i64 [ %inc, %for.body ], [ 0, %entry ]
%dummyload2 = load double, double* %A, align 8
%arrayidx.i24 = getelementptr inbounds %type, %type* %B, i64 %i, i32 0, i32 0
%_15 = load double, double* %arrayidx.i24, align 8
%call10 = tail call fast double @atan(double %_15)
%inc = add i64 %i, 1
%cmp = icmp ugt i64 1000, %inc
br i1 %cmp, label %for.body, label %for.end
-for.end:
+for.end:
ret void
}
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR1_LATCH:%.*]], label [[FOR2_HEADER]]
; CHECK: for1.latch:
; CHECK-NEXT: [[C:%.*]] = call i1 @cond()
-; CHECK-NEXT: br i1 [[C]], label [[EXIT:%.*]], label [[FOR1_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: br i1 [[C]], label [[EXIT:%.*]], label [[FOR1_HEADER]], !llvm.loop !0
; CHECK: exit:
; CHECK-NEXT: ret void
;
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=4 -enable-vplan-native-path -S %s | FileCheck %s
; Vectorize explicitly marked outer loop using vplan native path. Inner loop
; }
define void @inner_loop_reduction(double* noalias nocapture readonly %a.in, double* noalias nocapture readonly %b.in, double* noalias nocapture %c.out) {
; CHECK-LABEL: @inner_loop_reduction(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR1_LATCH4:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR1_LATCH4]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, double* [[A_IN:%.*]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP0]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[B_IN:%.*]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP1]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
-; CHECK-NEXT: br label [[FOR2_HEADER2:%.*]]
-; CHECK: for2.header2:
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP3:%.*]], [[FOR2_HEADER2]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x double> [ [[WIDE_MASKED_GATHER]], [[VECTOR_BODY]] ], [ [[TMP2:%.*]], [[FOR2_HEADER2]] ]
-; CHECK-NEXT: [[TMP2]] = fadd <4 x double> [[WIDE_MASKED_GATHER1]], [[VEC_PHI3]]
-; CHECK-NEXT: [[TMP3]] = add nuw nsw <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i32> [[TMP3]], <i32 10000, i32 10000, i32 10000, i32 10000>
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-NEXT: br i1 [[TMP5]], label [[FOR1_LATCH4]], label [[FOR2_HEADER2]]
-; CHECK: for1.latch4:
-; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x double> [ [[TMP2]], [[FOR2_HEADER2]] ]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[C_OUT:%.*]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[VEC_PHI5]], <4 x double*> [[TMP6]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP7]], <i64 1000, i64 1000, i64 1000, i64 1000>
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR1_HEADER:%.*]]
-; CHECK: for1.header:
-; CHECK-NEXT: [[INDVAR1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVAR11:%.*]], [[FOR1_LATCH:%.*]] ]
-; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds double, double* [[A_IN]], i64 [[INDVAR1]]
-; CHECK-NEXT: [[A:%.*]] = load double, double* [[A_PTR]], align 8
-; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds double, double* [[B_IN]], i64 [[INDVAR1]]
-; CHECK-NEXT: [[B:%.*]] = load double, double* [[B_PTR]], align 8
-; CHECK-NEXT: br label [[FOR2_HEADER:%.*]]
-; CHECK: for2.header:
-; CHECK-NEXT: [[INDVAR2:%.*]] = phi i32 [ 0, [[FOR1_HEADER]] ], [ [[INDVAR21:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT: [[A_REDUCTION:%.*]] = phi double [ [[A]], [[FOR1_HEADER]] ], [ [[A_REDUCTION1:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT: [[A_REDUCTION1]] = fadd double [[B]], [[A_REDUCTION]]
-; CHECK-NEXT: [[INDVAR21]] = add nuw nsw i32 [[INDVAR2]], 1
-; CHECK-NEXT: [[FOR2_COND:%.*]] = icmp eq i32 [[INDVAR21]], 10000
-; CHECK-NEXT: br i1 [[FOR2_COND]], label [[FOR1_LATCH]], label [[FOR2_HEADER]]
-; CHECK: for1.latch:
-; CHECK-NEXT: [[A_REDUCTION1_LCSSA:%.*]] = phi double [ [[A_REDUCTION1]], [[FOR2_HEADER]] ]
-; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds double, double* [[C_OUT]], i64 [[INDVAR1]]
-; CHECK-NEXT: store double [[A_REDUCTION1_LCSSA]], double* [[C_PTR]], align 8
-; CHECK-NEXT: [[INDVAR11]] = add nuw nsw i64 [[INDVAR1]], 1
-; CHECK-NEXT: [[FOR1_COND:%.*]] = icmp eq i64 [[INDVAR11]], 1000
-; CHECK-NEXT: br i1 [[FOR1_COND]], label [[EXIT]], label [[FOR1_HEADER]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body:
+; CHECK-NEXT: %[[FOR1_INDEX:.*]] = phi i64 [ 0, %[[LABEL_PR:.*]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH:.*]] ]
+; CHECK: %[[VEC_INDEX:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[LABEL_PR]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH]] ]
+; CHECK-NEXT: %[[A_PTR:.*]] = getelementptr inbounds double, double* %a.in, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: %[[MASKED_GATHER1:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %[[A_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+; CHECK-NEXT: %[[B_PTR:.*]] = getelementptr inbounds double, double* %b.in, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: %[[MASKED_GATHER2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %[[B_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+; CHECK-NEXT: br label %[[FOR2_HEADER:.*]]
+; CHECK: [[FOR2_HEADER]]:
+; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ zeroinitializer, %vector.body ], [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[MASKED_GATHER1]], %vector.body ], [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[MASKED_GATHER2]], %[[REDUCTION]]
+; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw <4 x i32> %[[FOR2_INDEX]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i32> %[[FOR2_INDEX_NEXT]], <i32 10000, i32 10000, i32 10000, i32 10000>
+; CHECK-NEXT: %[[EXIT_COND:.*]] = extractelement <4 x i1> %[[VEC_PTR]], i32 0
+; CHECK-NEXT: br i1 %[[EXIT_COND]], label %[[FOR1_LATCH:.*]], label %{{.*}}
+; CHECK: [[FOR1_LATCH]]:
+; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[REDUCTION_NEXT]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[C_PTR:.*]] = getelementptr inbounds double, double* %c.out, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %[[REDUCTION]], <4 x double*> %[[C_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: %[[VEC_INDEX_NEXT:.*]] = add nuw nsw <4 x i64> %[[VEC_INDEX]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i64> %[[VEC_INDEX_NEXT]], <i64 1000, i64 1000, i64 1000, i64 1000>
+; CHECK-NEXT: %[[FOR1_INDEX_NEXT:.*]] = add nuw i64 %[[FOR1_INDEX]], 4
+; CHECK-NEXT: %{{.*}} = add <4 x i64> %[[VEC_INDEX]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i64 %[[FOR1_INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 %[[EXIT_COND]], label %{{.*}}, label %vector.body
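+; The checks above verify that the outer loop is vectorized with VF 4: %a.in and
+; %b.in are gathered, the inner reduction loop runs inside vector.body with a
+; <4 x double> accumulator phi, and the results are scattered back to %c.out.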
entry:
br label %for1.header
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=4 -enable-vplan-native-path -S %s | FileCheck %s
; Test that the VPlan native path is able to widen call instructions like
declare double @llvm.sqrt.f64(double %0)
define void @widen_call_instruction(double* noalias nocapture readonly %a.in, double* noalias nocapture readonly %b.in, double* noalias nocapture %c.out) {
; CHECK-LABEL: @widen_call_instruction(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK: vector.ph:
-; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK: vector.body:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR1_LATCH4:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR1_LATCH4]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds double, double* [[A_IN:%.*]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP0]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[B_IN:%.*]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> [[TMP1]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
-; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> [[WIDE_MASKED_GATHER1]])
-; CHECK-NEXT: br label [[FOR2_HEADER2:%.*]]
-; CHECK: for2.header2:
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP4:%.*]], [[FOR2_HEADER2]] ]
-; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi <4 x double> [ [[WIDE_MASKED_GATHER]], [[VECTOR_BODY]] ], [ [[TMP3:%.*]], [[FOR2_HEADER2]] ]
-; CHECK-NEXT: [[TMP3]] = fadd <4 x double> [[TMP2]], [[VEC_PHI3]]
-; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i32> [[VEC_PHI]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i32> [[TMP4]], <i32 10000, i32 10000, i32 10000, i32 10000>
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT: br i1 [[TMP6]], label [[FOR1_LATCH4]], label [[FOR2_HEADER2]]
-; CHECK: for1.latch4:
-; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <4 x double> [ [[TMP3]], [[FOR2_HEADER2]] ]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, double* [[C_OUT:%.*]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[VEC_PHI5]], <4 x double*> [[TMP7]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq <4 x i64> [[TMP8]], <i64 1000, i64 1000, i64 1000, i64 1000>
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR1_HEADER:%.*]]
-; CHECK: for1.header:
-; CHECK-NEXT: [[INDVAR1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVAR11:%.*]], [[FOR1_LATCH:%.*]] ]
-; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds double, double* [[A_IN]], i64 [[INDVAR1]]
-; CHECK-NEXT: [[A:%.*]] = load double, double* [[A_PTR]], align 8
-; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds double, double* [[B_IN]], i64 [[INDVAR1]]
-; CHECK-NEXT: [[B:%.*]] = load double, double* [[B_PTR]], align 8
-; CHECK-NEXT: [[B_SQRT:%.*]] = call double @llvm.sqrt.f64(double [[B]])
-; CHECK-NEXT: br label [[FOR2_HEADER:%.*]]
-; CHECK: for2.header:
-; CHECK-NEXT: [[INDVAR2:%.*]] = phi i32 [ 0, [[FOR1_HEADER]] ], [ [[INDVAR21:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT: [[A_REDUCTION:%.*]] = phi double [ [[A]], [[FOR1_HEADER]] ], [ [[A_REDUCTION1:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT: [[A_REDUCTION1]] = fadd double [[B_SQRT]], [[A_REDUCTION]]
-; CHECK-NEXT: [[INDVAR21]] = add nuw nsw i32 [[INDVAR2]], 1
-; CHECK-NEXT: [[FOR2_COND:%.*]] = icmp eq i32 [[INDVAR21]], 10000
-; CHECK-NEXT: br i1 [[FOR2_COND]], label [[FOR1_LATCH]], label [[FOR2_HEADER]]
-; CHECK: for1.latch:
-; CHECK-NEXT: [[A_REDUCTION1_LCSSA:%.*]] = phi double [ [[A_REDUCTION1]], [[FOR2_HEADER]] ]
-; CHECK-NEXT: [[C_PTR:%.*]] = getelementptr inbounds double, double* [[C_OUT]], i64 [[INDVAR1]]
-; CHECK-NEXT: store double [[A_REDUCTION1_LCSSA]], double* [[C_PTR]], align 8
-; CHECK-NEXT: [[INDVAR11]] = add nuw nsw i64 [[INDVAR1]], 1
-; CHECK-NEXT: [[FOR1_COND:%.*]] = icmp eq i64 [[INDVAR11]], 1000
-; CHECK-NEXT: br i1 [[FOR1_COND]], label [[EXIT]], label [[FOR1_HEADER]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
+; CHECK: vector.body:
+; CHECK-NEXT: %[[FOR1_INDEX:.*]] = phi i64 [ 0, %[[LABEL_PR:.*]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH:.*]] ]
+; CHECK: %[[VEC_INDEX:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[LABEL_PR]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH]] ]
+; CHECK-NEXT: %[[A_PTR:.*]] = getelementptr inbounds double, double* %a.in, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: %[[MASKED_GATHER1:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %[[A_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+; CHECK-NEXT: %[[B_PTR:.*]] = getelementptr inbounds double, double* %b.in, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: %[[MASKED_GATHER2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %[[B_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+; CHECK-NEXT: %[[B_SQRT:.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %[[MASKED_GATHER2]])
+; CHECK-NEXT: br label %[[FOR2_HEADER:.*]]
+; CHECK: [[FOR2_HEADER]]:
+; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ zeroinitializer, %vector.body ], [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[MASKED_GATHER1]], %vector.body ], [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[B_SQRT]], %[[REDUCTION]]
+; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw <4 x i32> %[[FOR2_INDEX]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i32> %[[FOR2_INDEX_NEXT]], <i32 10000, i32 10000, i32 10000, i32 10000>
+; CHECK-NEXT: %[[EXIT_COND:.*]] = extractelement <4 x i1> %[[VEC_PTR]], i32 0
+; CHECK-NEXT: br i1 %[[EXIT_COND]], label %[[FOR1_LATCH:.*]], label %{{.*}}
+; CHECK: [[FOR1_LATCH]]:
+; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[REDUCTION_NEXT]], %[[FOR2_HEADER]] ]
+; CHECK-NEXT: %[[C_PTR:.*]] = getelementptr inbounds double, double* %c.out, <4 x i64> %[[VEC_INDEX]]
+; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %[[REDUCTION]], <4 x double*> %[[C_PTR]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT: %[[VEC_INDEX_NEXT:.*]] = add nuw nsw <4 x i64> %[[VEC_INDEX]], <i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i64> %[[VEC_INDEX_NEXT]], <i64 1000, i64 1000, i64 1000, i64 1000>
+; CHECK-NEXT: %[[FOR1_INDEX_NEXT:.*]] = add nuw i64 %[[FOR1_INDEX]], 4
+; CHECK-NEXT: %{{.*}} = add <4 x i64> %[[VEC_INDEX]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i64 %[[FOR1_INDEX_NEXT]], 1000
+; CHECK-NEXT: br i1 %[[EXIT_COND]], label %{{.*}}, label %vector.body
entry:
br label %for1.header
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -loop-vectorize -force-vector-width=4 -enable-vplan-native-path -S %s | FileCheck %s
; Test that the VPlan native path is able to widen the select instruction in the
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_BODY]] ], [ [[TMP2:%.*]], [[FOR2_HEADER1]] ]
; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[SELECT:%.*]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[TMP1]], <4 x double*> [[TMP0]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT: [[TMP2]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i64> [[TMP2]], <i64 10000, i64 10000, i64 10000, i64 10000>
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i32 0
-; CHECK-NEXT: br i1 [[TMP4]], label [[FOR1_LATCH4]], label [[FOR2_HEADER1]]
-; CHECK: for1.latch4:
-; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq <4 x i64> [[TMP5]], <i64 1000, i64 1000, i64 1000, i64 1000>
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR1_HEADER:%.*]]
-; CHECK: for1.header:
-; CHECK-NEXT: [[INDVAR1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVAR11:%.*]], [[FOR1_LATCH:%.*]] ]
-; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVAR1]]
-; CHECK-NEXT: br label [[FOR2_HEADER:%.*]]
-; CHECK: for2.header:
-; CHECK-NEXT: [[INDVAR2:%.*]] = phi i64 [ 0, [[FOR1_HEADER]] ], [ [[INDVAR21:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT: [[SELECT_B:%.*]] = select i1 [[SELECT]], double [[A]], double [[B]]
-; CHECK-NEXT: store double [[SELECT_B]], double* [[PTR]], align 8
-; CHECK-NEXT: [[INDVAR21]] = add nuw nsw i64 [[INDVAR2]], 1
-; CHECK-NEXT: [[FOR2_COND:%.*]] = icmp eq i64 [[INDVAR21]], 10000
-; CHECK-NEXT: br i1 [[FOR2_COND]], label [[FOR1_LATCH]], label [[FOR2_HEADER]]
-; CHECK: for1.latch:
-; CHECK-NEXT: [[INDVAR11]] = add nuw nsw i64 [[INDVAR1]], 1
-; CHECK-NEXT: [[FOR1_COND:%.*]] = icmp eq i64 [[INDVAR11]], 1000
-; CHECK-NEXT: br i1 [[FOR1_COND]], label [[EXIT]], label [[FOR1_HEADER]], !llvm.loop [[LOOP2:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
entry:
br label %for1.header
; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[VEC_IND]] to <4 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[TMP2]], <4 x double*> [[TMP0]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT: [[TMP3]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], <i64 10000, i64 10000, i64 10000, i64 10000>
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-NEXT: br i1 [[TMP5]], label [[FOR1_LATCH4]], label [[FOR2_HEADER1]]
-; CHECK: for1.latch4:
-; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP6]], <i64 1000, i64 1000, i64 1000, i64 1000>
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR1_HEADER:%.*]]
-; CHECK: for1.header:
-; CHECK-NEXT: [[INDVAR1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVAR11:%.*]], [[FOR1_LATCH:%.*]] ]
-; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVAR1]]
-; CHECK-NEXT: br label [[FOR2_HEADER:%.*]]
-; CHECK: for2.header:
-; CHECK-NEXT: [[INDVAR2:%.*]] = phi i64 [ 0, [[FOR1_HEADER]] ], [ [[INDVAR21:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT: [[SELECT:%.*]] = trunc i64 [[INDVAR1]] to i1
-; CHECK-NEXT: [[SELECT_B:%.*]] = select i1 [[SELECT]], double [[A]], double [[B]]
-; CHECK-NEXT: store double [[SELECT_B]], double* [[PTR]], align 8
-; CHECK-NEXT: [[INDVAR21]] = add nuw nsw i64 [[INDVAR2]], 1
-; CHECK-NEXT: [[FOR2_COND:%.*]] = icmp eq i64 [[INDVAR21]], 10000
-; CHECK-NEXT: br i1 [[FOR2_COND]], label [[FOR1_LATCH]], label [[FOR2_HEADER]]
-; CHECK: for1.latch:
-; CHECK-NEXT: [[INDVAR11]] = add nuw nsw i64 [[INDVAR1]], 1
-; CHECK-NEXT: [[FOR1_COND:%.*]] = icmp eq i64 [[INDVAR11]], 1000
-; CHECK-NEXT: br i1 [[FOR1_COND]], label [[EXIT]], label [[FOR1_HEADER]], !llvm.loop [[LOOP4:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
entry:
br label %for1.header
; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i64> [[VEC_PHI]] to <4 x i1>
; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[TMP2]], <4 x double*> [[TMP0]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT: [[TMP3]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[TMP3]], <i64 10000, i64 10000, i64 10000, i64 10000>
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP4]], i32 0
-; CHECK-NEXT: br i1 [[TMP5]], label [[FOR1_LATCH4]], label [[FOR2_HEADER1]]
-; CHECK: for1.latch4:
-; CHECK-NEXT: [[TMP6:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i64> [[TMP6]], <i64 1000, i64 1000, i64 1000, i64 1000>
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR1_HEADER:%.*]]
-; CHECK: for1.header:
-; CHECK-NEXT: [[INDVAR1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVAR11:%.*]], [[FOR1_LATCH:%.*]] ]
-; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVAR1]]
-; CHECK-NEXT: br label [[FOR2_HEADER:%.*]]
-; CHECK: for2.header:
-; CHECK-NEXT: [[INDVAR2:%.*]] = phi i64 [ 0, [[FOR1_HEADER]] ], [ [[INDVAR21:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT: [[SELECT:%.*]] = trunc i64 [[INDVAR2]] to i1
-; CHECK-NEXT: [[SELECT_B:%.*]] = select i1 [[SELECT]], double [[A]], double [[B]]
-; CHECK-NEXT: store double [[SELECT_B]], double* [[PTR]], align 8
-; CHECK-NEXT: [[INDVAR21]] = add nuw nsw i64 [[INDVAR2]], 1
-; CHECK-NEXT: [[FOR2_COND:%.*]] = icmp eq i64 [[INDVAR21]], 10000
-; CHECK-NEXT: br i1 [[FOR2_COND]], label [[FOR1_LATCH]], label [[FOR2_HEADER]]
-; CHECK: for1.latch:
-; CHECK-NEXT: [[INDVAR11]] = add nuw nsw i64 [[INDVAR1]], 1
-; CHECK-NEXT: [[FOR1_COND:%.*]] = icmp eq i64 [[INDVAR11]], 1000
-; CHECK-NEXT: br i1 [[FOR1_COND]], label [[EXIT]], label [[FOR1_HEADER]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
entry:
br label %for1.header
; CHECK-NEXT: [[TMP2:%.*]] = trunc <4 x i64> [[TMP1]] to <4 x i1>
; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP2]], <4 x double> [[BROADCAST_SPLAT]], <4 x double> [[BROADCAST_SPLAT3]]
; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> [[TMP3]], <4 x double*> [[TMP0]], i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
-; CHECK-NEXT: [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP4]], <i64 10000, i64 10000, i64 10000, i64 10000>
-; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT: br i1 [[TMP6]], label [[FOR1_LATCH4]], label [[FOR2_HEADER1]]
-; CHECK: for1.latch4:
-; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP7]], <i64 1000, i64 1000, i64 1000, i64 1000>
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
-; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
-; CHECK: middle.block:
-; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, 1000
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: br label [[FOR1_HEADER:%.*]]
-; CHECK: for1.header:
-; CHECK-NEXT: [[INDVAR1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVAR11:%.*]], [[FOR1_LATCH:%.*]] ]
-; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVAR1]]
-; CHECK-NEXT: br label [[FOR2_HEADER:%.*]]
-; CHECK: for2.header:
-; CHECK-NEXT: [[INDVAR2:%.*]] = phi i64 [ 0, [[FOR1_HEADER]] ], [ [[INDVAR21:%.*]], [[FOR2_HEADER]] ]
-; CHECK-NEXT: [[SUM:%.*]] = add nuw nsw i64 [[INDVAR1]], [[INDVAR2]]
-; CHECK-NEXT: [[SELECT:%.*]] = trunc i64 [[SUM]] to i1
-; CHECK-NEXT: [[SELECT_B:%.*]] = select i1 [[SELECT]], double [[A]], double [[B]]
-; CHECK-NEXT: store double [[SELECT_B]], double* [[PTR]], align 8
-; CHECK-NEXT: [[INDVAR21]] = add nuw nsw i64 [[INDVAR2]], 1
-; CHECK-NEXT: [[FOR2_COND:%.*]] = icmp eq i64 [[INDVAR21]], 10000
-; CHECK-NEXT: br i1 [[FOR2_COND]], label [[FOR1_LATCH]], label [[FOR2_HEADER]]
-; CHECK: for1.latch:
-; CHECK-NEXT: [[INDVAR11]] = add nuw nsw i64 [[INDVAR1]], 1
-; CHECK-NEXT: [[FOR1_COND:%.*]] = icmp eq i64 [[INDVAR11]], 1000
-; CHECK-NEXT: br i1 [[FOR1_COND]], label [[EXIT]], label [[FOR1_HEADER]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK: exit:
-; CHECK-NEXT: ret void
-;
entry:
br label %for1.header
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=sroa -S | FileCheck %s
target datalayout = "e-p:64:64:64-p1:16:16:16-p3:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
; Make sure an illegal bitcast isn't introduced
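+; (A rewrite such as "bitcast i16 addrspace(1)* %b to <2 x i64>*" would be invalid
+; IR, because bitcast cannot change a pointer's address space; an addrspacecast
+; would be required. The checks below only accept same-address-space accesses.)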
define void @test_address_space_1_1(<2 x i64> addrspace(1)* %a, i16 addrspace(1)* %b) {
; CHECK-LABEL: @test_address_space_1_1(
-; CHECK-NEXT: [[AA_0_COPYLOAD:%.*]] = load <2 x i64>, <2 x i64> addrspace(1)* [[A:%.*]], align 2
-; CHECK-NEXT: [[AA_0_BPTR_SROA_CAST:%.*]] = bitcast i16 addrspace(1)* [[B:%.*]] to <2 x i64> addrspace(1)*
-; CHECK-NEXT: store <2 x i64> [[AA_0_COPYLOAD]], <2 x i64> addrspace(1)* [[AA_0_BPTR_SROA_CAST]], align 2
-; CHECK-NEXT: ret void
-;
+; CHECK: load <2 x i64>, <2 x i64> addrspace(1)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(1)* {{.*}}, align 2
+; CHECK: ret void
%aa = alloca <2 x i64>, align 16
%aptr = bitcast <2 x i64> addrspace(1)* %a to i8 addrspace(1)*
%aaptr = bitcast <2 x i64>* %aa to i8*
define void @test_address_space_1_0(<2 x i64> addrspace(1)* %a, i16* %b) {
; CHECK-LABEL: @test_address_space_1_0(
-; CHECK-NEXT: [[AA_0_COPYLOAD:%.*]] = load <2 x i64>, <2 x i64> addrspace(1)* [[A:%.*]], align 2
-; CHECK-NEXT: [[AA_0_BPTR_SROA_CAST:%.*]] = bitcast i16* [[B:%.*]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[AA_0_COPYLOAD]], <2 x i64>* [[AA_0_BPTR_SROA_CAST]], align 2
-; CHECK-NEXT: ret void
-;
+; CHECK: load <2 x i64>, <2 x i64> addrspace(1)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64>* {{.*}}, align 2
+; CHECK: ret void
%aa = alloca <2 x i64>, align 16
%aptr = bitcast <2 x i64> addrspace(1)* %a to i8 addrspace(1)*
%aaptr = bitcast <2 x i64>* %aa to i8*
define void @test_address_space_0_1(<2 x i64>* %a, i16 addrspace(1)* %b) {
; CHECK-LABEL: @test_address_space_0_1(
-; CHECK-NEXT: [[AA_0_COPYLOAD:%.*]] = load <2 x i64>, <2 x i64>* [[A:%.*]], align 2
-; CHECK-NEXT: [[AA_0_BPTR_SROA_CAST:%.*]] = bitcast i16 addrspace(1)* [[B:%.*]] to <2 x i64> addrspace(1)*
-; CHECK-NEXT: store <2 x i64> [[AA_0_COPYLOAD]], <2 x i64> addrspace(1)* [[AA_0_BPTR_SROA_CAST]], align 2
-; CHECK-NEXT: ret void
-;
+; CHECK: load <2 x i64>, <2 x i64>* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(1)* {{.*}}, align 2
+; CHECK: ret void
%aa = alloca <2 x i64>, align 16
%aptr = bitcast <2 x i64>* %a to i8*
%aaptr = bitcast <2 x i64>* %aa to i8*
; Function Attrs: nounwind
define void @copy_struct([5 x i64] %in.coerce) {
; CHECK-LABEL: @copy_struct(
-; CHECK-NEXT: for.end:
-; CHECK-NEXT: [[IN_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [5 x i64] [[IN_COERCE:%.*]], 0
-; CHECK-NEXT: [[IN_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [5 x i64] [[IN_COERCE]], 1
-; CHECK-NEXT: [[IN_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [5 x i64] [[IN_COERCE]], 2
-; CHECK-NEXT: [[IN_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [5 x i64] [[IN_COERCE]], 3
-; CHECK-NEXT: [[IN_SROA_2_4_EXTRACT_SHIFT:%.*]] = lshr i64 [[IN_COERCE_FCA_2_EXTRACT]], 32
-; CHECK-NEXT: [[IN_SROA_2_4_EXTRACT_TRUNC:%.*]] = trunc i64 [[IN_SROA_2_4_EXTRACT_SHIFT]] to i32
-; CHECK-NEXT: store i32 [[IN_SROA_2_4_EXTRACT_TRUNC]], i32 addrspace(1)* undef, align 4
-; CHECK-NEXT: store i64 [[IN_COERCE_FCA_3_EXTRACT]], i64 addrspace(1)* poison, align 4
-; CHECK-NEXT: store i32 undef, i32 addrspace(1)* poison, align 4
-; CHECK-NEXT: ret void
-;
+; CHECK-NOT: memcpy
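+; SROA should lower the aggregate copy into per-element extractvalue/store
+; operations, so no call to llvm.memcpy is expected to survive.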
for.end:
%in = alloca %struct.struct_test_27.0.13, align 8
%0 = bitcast %struct.struct_test_27.0.13* %in to [5 x i64]*
call void @llvm.memcpy.p1i8.p0i8.i32(i8 addrspace(1)* align 4 undef, i8* align 4 %scevgep910, i32 16, i1 false)
ret void
}
-
+
%union.anon = type { i32* }
@g = common global i32 0, align 4
; illegal bitcast isn't introduced
define void @pr27557() {
; CHECK-LABEL: @pr27557(
-; CHECK-NEXT: [[DOTSROA_0:%.*]] = alloca i32*, align 8
-; CHECK-NEXT: store i32* @g, i32** [[DOTSROA_0]], align 8
-; CHECK-NEXT: [[DOTSROA_0_0__SROA_CAST1:%.*]] = bitcast i32** [[DOTSROA_0]] to i32 addrspace(3)**
-; CHECK-NEXT: store i32 addrspace(3)* @l, i32 addrspace(3)** [[DOTSROA_0_0__SROA_CAST1]], align 8
-; CHECK-NEXT: ret void
-;
+; CHECK: %[[CAST:.*]] = bitcast i32** {{.*}} to i32 addrspace(3)**
+; CHECK: store i32 addrspace(3)* @l, i32 addrspace(3)** %[[CAST]]
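+; (Bitcasting i32** to i32 addrspace(3)** is legal here: both are addrspace(0)
+; pointers and only the pointee's address space differs, so SROA can reuse the
+; alloca slot for the @l store without an illegal cast.)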
%1 = alloca %union.anon, align 8
%2 = bitcast %union.anon* %1 to i32**
store i32* @g, i32** %2, align 8
; should be promoted through the pair of `ptrtoint`/`inttoptr`.
define i32* @pr27557.alt() {
; CHECK-LABEL: @pr27557.alt(
-; CHECK-NEXT: ret i32* inttoptr (i64 ptrtoint (i32 addrspace(2)* @l2 to i64) to i32*)
-;
+; CHECK: ret i32* inttoptr (i64 ptrtoint (i32 addrspace(2)* @l2 to i64) to i32*)
%1 = alloca %union.anon, align 8
%2 = bitcast %union.anon* %1 to i32 addrspace(2)**
store i32 addrspace(2)* @l2, i32 addrspace(2)** %2, align 8
; Make sure pre-splitting doesn't try to introduce an illegal bitcast
define float @presplit(i64 addrspace(1)* %p) {
-; CHECK-LABEL: @presplit(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[P_SROA_CAST:%.*]] = bitcast i64 addrspace(1)* [[P:%.*]] to i32 addrspace(1)*
-; CHECK-NEXT: [[L1:%.*]] = load i32, i32 addrspace(1)* [[P_SROA_CAST]], align 4
-; CHECK-NEXT: [[P_SROA_RAW_CAST:%.*]] = bitcast i64 addrspace(1)* [[P]] to i8 addrspace(1)*
-; CHECK-NEXT: [[P_SROA_RAW_IDX:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[P_SROA_RAW_CAST]], i16 4
-; CHECK-NEXT: [[P_SROA_CAST2:%.*]] = bitcast i8 addrspace(1)* [[P_SROA_RAW_IDX]] to i32 addrspace(1)*
-; CHECK-NEXT: [[L3:%.*]] = load i32, i32 addrspace(1)* [[P_SROA_CAST2]], align 4
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[L1]] to float
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[L3]] to float
-; CHECK-NEXT: [[RET:%.*]] = fadd float [[TMP0]], [[TMP1]]
-; CHECK-NEXT: ret float [[RET]]
-;
entry:
- %b = alloca i64
- %b.cast = bitcast i64* %b to [2 x float]*
- %b.gep1 = getelementptr [2 x float], [2 x float]* %b.cast, i32 0, i32 0
- %b.gep2 = getelementptr [2 x float], [2 x float]* %b.cast, i32 0, i32 1
- %l = load i64, i64 addrspace(1)* %p
- store i64 %l, i64* %b
- %f1 = load float, float* %b.gep1
- %f2 = load float, float* %b.gep2
- %ret = fadd float %f1, %f2
- ret float %ret
+; CHECK-LABEL: @presplit(
+; CHECK: %[[CAST:.*]] = bitcast i64 addrspace(1)* {{.*}} to i32 addrspace(1)*
+; CHECK: load i32, i32 addrspace(1)* %[[CAST]]
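+; Pre-splitting rewrites the i64 load from %p into two i32 loads; the new
+; pointers must stay in addrspace(1), so only same-address-space bitcasts and
+; GEPs should be introduced.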
+ %b = alloca i64
+ %b.cast = bitcast i64* %b to [2 x float]*
+ %b.gep1 = getelementptr [2 x float], [2 x float]* %b.cast, i32 0, i32 0
+ %b.gep2 = getelementptr [2 x float], [2 x float]* %b.cast, i32 0, i32 1
+ %l = load i64, i64 addrspace(1)* %p
+ store i64 %l, i64* %b
+ %f1 = load float, float* %b.gep1
+ %f2 = load float, float* %b.gep2
+ %ret = fadd float %f1, %f2
+ ret float %ret
}
; Test load from and store to non-zero address space.
define void @test_load_store_diff_addr_space([2 x float] addrspace(1)* %complex1, [2 x float] addrspace(1)* %complex2) {
-; CHECK-LABEL: @test_load_store_diff_addr_space(
-; CHECK-NEXT: [[P1_SROA_CAST:%.*]] = bitcast [2 x float] addrspace(1)* [[COMPLEX1:%.*]] to i32 addrspace(1)*
-; CHECK-NEXT: [[V15:%.*]] = load i32, i32 addrspace(1)* [[P1_SROA_CAST]], align 4
-; CHECK-NEXT: [[P1_SROA_IDX:%.*]] = getelementptr inbounds [2 x float], [2 x float] addrspace(1)* [[COMPLEX1]], i16 0, i16 1
-; CHECK-NEXT: [[P1_SROA_CAST7:%.*]] = bitcast float addrspace(1)* [[P1_SROA_IDX]] to i32 addrspace(1)*
-; CHECK-NEXT: [[V18:%.*]] = load i32, i32 addrspace(1)* [[P1_SROA_CAST7]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[V15]] to float
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[V18]] to float
-; CHECK-NEXT: [[SUM:%.*]] = fadd float [[TMP1]], [[TMP2]]
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[SUM]] to i32
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[SUM]] to i32
-; CHECK-NEXT: [[P2_SROA_CAST:%.*]] = bitcast [2 x float] addrspace(1)* [[COMPLEX2:%.*]] to i32 addrspace(1)*
-; CHECK-NEXT: store i32 [[TMP3]], i32 addrspace(1)* [[P2_SROA_CAST]], align 4
-; CHECK-NEXT: [[P2_SROA_IDX:%.*]] = getelementptr inbounds [2 x float], [2 x float] addrspace(1)* [[COMPLEX2]], i16 0, i16 1
-; CHECK-NEXT: [[P2_SROA_CAST4:%.*]] = bitcast float addrspace(1)* [[P2_SROA_IDX]] to i32 addrspace(1)*
-; CHECK-NEXT: store i32 [[TMP4]], i32 addrspace(1)* [[P2_SROA_CAST4]], align 4
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @test_load_store_diff_addr_space
+; CHECK-NOT: alloca
+; CHECK: load i32, i32 addrspace(1)*
+; CHECK: load i32, i32 addrspace(1)*
+; CHECK: store i32 %{{.*}}, i32 addrspace(1)*
+; CHECK: store i32 %{{.*}}, i32 addrspace(1)*
%a = alloca i64
%a.cast = bitcast i64* %a to [2 x float]*
%a.gep1 = getelementptr [2 x float], [2 x float]* %a.cast, i32 0, i32 0
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=sroa -S | FileCheck %s
; RUN: opt -passes='debugify,function(sroa)' -S < %s | FileCheck %s -check-prefix DEBUGLOC
define void @test1({ i8, i8 }* %a, { i8, i8 }* %b) {
; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ALLOCA_SROA_0_0_GEP_A_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[A:%.*]], i64 0, i32 0
-; CHECK-NEXT: [[ALLOCA_SROA_0_0_COPYLOAD:%.*]] = load i8, i8* [[ALLOCA_SROA_0_0_GEP_A_SROA_IDX]], align 16
-; CHECK-NEXT: [[ALLOCA_SROA_3_0_GEP_A_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[A]], i64 0, i32 1
-; CHECK-NEXT: [[ALLOCA_SROA_3_0_COPYLOAD:%.*]] = load i8, i8* [[ALLOCA_SROA_3_0_GEP_A_SROA_IDX]], align 1
-; CHECK-NEXT: [[ALLOCA_SROA_0_0_GEP_B_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[B:%.*]], i64 0, i32 0
-; CHECK-NEXT: store i8 [[ALLOCA_SROA_0_0_COPYLOAD]], i8* [[ALLOCA_SROA_0_0_GEP_B_SROA_IDX]], align 16
-; CHECK-NEXT: [[ALLOCA_SROA_3_0_GEP_B_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[B]], i64 0, i32 1
-; CHECK-NEXT: store i8 [[ALLOCA_SROA_3_0_COPYLOAD]], i8* [[ALLOCA_SROA_3_0_GEP_B_SROA_IDX]], align 1
-; CHECK-NEXT: ret void
-;
-; DEBUGLOC-LABEL: @test1(
-; DEBUGLOC-NEXT: entry:
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata { i8, i8 }* undef, metadata [[META9:![0-9]+]], metadata !DIExpression()), !dbg [[DBG14:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG15:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META12:![0-9]+]], metadata !DIExpression()), !dbg [[DBG16:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META13:![0-9]+]], metadata !DIExpression()), !dbg [[DBG17:![0-9]+]]
-; DEBUGLOC-NEXT: [[ALLOCA_SROA_0_0_GEP_A_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[A:%.*]], i64 0, i32 0, !dbg [[DBG18:![0-9]+]]
-; DEBUGLOC-NEXT: [[ALLOCA_SROA_0_0_COPYLOAD:%.*]] = load i8, i8* [[ALLOCA_SROA_0_0_GEP_A_SROA_IDX]], align 16, !dbg [[DBG18]]
-; DEBUGLOC-NEXT: [[ALLOCA_SROA_3_0_GEP_A_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[A]], i64 0, i32 1, !dbg [[DBG18]]
-; DEBUGLOC-NEXT: [[ALLOCA_SROA_3_0_COPYLOAD:%.*]] = load i8, i8* [[ALLOCA_SROA_3_0_GEP_A_SROA_IDX]], align 1, !dbg [[DBG18]]
-; DEBUGLOC-NEXT: [[ALLOCA_SROA_0_0_GEP_B_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[B:%.*]], i64 0, i32 0, !dbg [[DBG19:![0-9]+]]
-; DEBUGLOC-NEXT: store i8 [[ALLOCA_SROA_0_0_COPYLOAD]], i8* [[ALLOCA_SROA_0_0_GEP_B_SROA_IDX]], align 16, !dbg [[DBG19]]
-; DEBUGLOC-NEXT: [[ALLOCA_SROA_3_0_GEP_B_SROA_IDX:%.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* [[B]], i64 0, i32 1, !dbg [[DBG19]]
-; DEBUGLOC-NEXT: store i8 [[ALLOCA_SROA_3_0_COPYLOAD]], i8* [[ALLOCA_SROA_3_0_GEP_B_SROA_IDX]], align 1, !dbg [[DBG19]]
-; DEBUGLOC-NEXT: ret void, !dbg [[DBG20:![0-9]+]]
-;
+; CHECK: %[[gep_a0:.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* %a, i64 0, i32 0
+; CHECK: %[[a0:.*]] = load i8, i8* %[[gep_a0]], align 16
+; CHECK: %[[gep_a1:.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* %a, i64 0, i32 1
+; CHECK: %[[a1:.*]] = load i8, i8* %[[gep_a1]], align 1
+; CHECK: %[[gep_b0:.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* %b, i64 0, i32 0
+; CHECK: store i8 %[[a0]], i8* %[[gep_b0]], align 16
+; CHECK: %[[gep_b1:.*]] = getelementptr inbounds { i8, i8 }, { i8, i8 }* %b, i64 0, i32 1
+; CHECK: store i8 %[[a1]], i8* %[[gep_b1]], align 1
+; CHECK: ret void
entry:
%alloca = alloca { i8, i8 }, align 16
define void @test2() {
; CHECK-LABEL: @test2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca i16, align 2
-; CHECK-NEXT: store volatile i16 0, i16* [[A_SROA_0]], align 2
-; CHECK-NEXT: [[A_SROA_0_1_GEP2_SROA_RAW_CAST:%.*]] = bitcast i16* [[A_SROA_0]] to i8*
-; CHECK-NEXT: [[A_SROA_0_1_GEP2_SROA_RAW_IDX:%.*]] = getelementptr inbounds i8, i8* [[A_SROA_0_1_GEP2_SROA_RAW_CAST]], i64 1
-; CHECK-NEXT: [[A_SROA_0_1_A_SROA_0_2_RESULT:%.*]] = load i8, i8* [[A_SROA_0_1_GEP2_SROA_RAW_IDX]], align 1
-; CHECK-NEXT: [[A_SROA_0_1_GEP2_SROA_RAW_CAST3:%.*]] = bitcast i16* [[A_SROA_0]] to i8*
-; CHECK-NEXT: [[A_SROA_0_1_GEP2_SROA_RAW_IDX4:%.*]] = getelementptr inbounds i8, i8* [[A_SROA_0_1_GEP2_SROA_RAW_CAST3]], i64 1
-; CHECK-NEXT: store i8 42, i8* [[A_SROA_0_1_GEP2_SROA_RAW_IDX4]], align 1
-; CHECK-NEXT: ret void
-;
-; DEBUGLOC-LABEL: @test2(
-; DEBUGLOC-NEXT: entry:
-; DEBUGLOC-NEXT: [[A_SROA_0:%.*]] = alloca i16, align 2, !dbg [[DBG29:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata { i8, i8, i8, i8 }* undef, metadata [[META23:![0-9]+]], metadata !DIExpression()), !dbg [[DBG29]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META24:![0-9]+]], metadata !DIExpression()), !dbg [[DBG30:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i16* undef, metadata [[META25:![0-9]+]], metadata !DIExpression()), !dbg [[DBG31:![0-9]+]]
-; DEBUGLOC-NEXT: store volatile i16 0, i16* [[A_SROA_0]], align 2, !dbg [[DBG32:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META26:![0-9]+]], metadata !DIExpression()), !dbg [[DBG33:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_0_1_GEP2_SROA_RAW_CAST:%.*]] = bitcast i16* [[A_SROA_0]] to i8*, !dbg [[DBG34:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_0_1_GEP2_SROA_RAW_IDX:%.*]] = getelementptr inbounds i8, i8* [[A_SROA_0_1_GEP2_SROA_RAW_CAST]], i64 1, !dbg [[DBG34]]
-; DEBUGLOC-NEXT: [[A_SROA_0_1_A_SROA_0_2_RESULT:%.*]] = load i8, i8* [[A_SROA_0_1_GEP2_SROA_RAW_IDX]], align 1, !dbg [[DBG34]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8 [[A_SROA_0_1_A_SROA_0_2_RESULT]], metadata [[META27:![0-9]+]], metadata !DIExpression()), !dbg [[DBG34]]
-; DEBUGLOC-NEXT: [[A_SROA_0_1_GEP2_SROA_RAW_CAST3:%.*]] = bitcast i16* [[A_SROA_0]] to i8*, !dbg [[DBG35:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_0_1_GEP2_SROA_RAW_IDX4:%.*]] = getelementptr inbounds i8, i8* [[A_SROA_0_1_GEP2_SROA_RAW_CAST3]], i64 1, !dbg [[DBG35]]
-; DEBUGLOC-NEXT: store i8 42, i8* [[A_SROA_0_1_GEP2_SROA_RAW_IDX4]], align 1, !dbg [[DBG35]]
-; DEBUGLOC-NEXT: ret void, !dbg [[DBG36:![0-9]+]]
-;
+; CHECK: alloca i16
+; CHECK: load i8, i8* %{{.*}}
+; CHECK: store i8 42, i8* %{{.*}}
+; CHECK: ret void
; Check that when SROA rewrites the alloca partition,
; it preserves the original DebugLocation.
+; DEBUGLOC-LABEL: @test2(
+; DEBUGLOC: {{.*}} = alloca {{.*}} !dbg ![[DbgLoc:[0-9]+]]
+; DEBUGLOC-LABEL: }
+;
+; DEBUGLOC: ![[DbgLoc]] = !DILocation(line: 9,
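+; (debugify assigns each instruction a sequential line number, which is why the
+; alloca below is expected at line 9; the DEBUGLOC check verifies the rewritten
+; alloca keeps that !DILocation.)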
entry:
%a = alloca { i8, i8, i8, i8 }, align 2 ; "line 9" to -debugify
define void @PR13920(<2 x i64>* %a, i16* %b) {
; Test that alignments on memcpy intrinsics get propagated to loads and stores.
; CHECK-LABEL: @PR13920(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[AA_0_COPYLOAD:%.*]] = load <2 x i64>, <2 x i64>* [[A:%.*]], align 2
-; CHECK-NEXT: [[AA_0_BPTR_SROA_CAST:%.*]] = bitcast i16* [[B:%.*]] to <2 x i64>*
-; CHECK-NEXT: store <2 x i64> [[AA_0_COPYLOAD]], <2 x i64>* [[AA_0_BPTR_SROA_CAST]], align 2
-; CHECK-NEXT: ret void
-;
-; DEBUGLOC-LABEL: @PR13920(
-; DEBUGLOC-NEXT: entry:
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata <2 x i64>* undef, metadata [[META39:![0-9]+]], metadata !DIExpression()), !dbg [[DBG43:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META40:![0-9]+]], metadata !DIExpression()), !dbg [[DBG44:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META41:![0-9]+]], metadata !DIExpression()), !dbg [[DBG45:![0-9]+]]
-; DEBUGLOC-NEXT: [[AA_0_COPYLOAD:%.*]] = load <2 x i64>, <2 x i64>* [[A:%.*]], align 2, !dbg [[DBG46:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META42:![0-9]+]], metadata !DIExpression()), !dbg [[DBG47:![0-9]+]]
-; DEBUGLOC-NEXT: [[AA_0_BPTR_SROA_CAST:%.*]] = bitcast i16* [[B:%.*]] to <2 x i64>*, !dbg [[DBG48:![0-9]+]]
-; DEBUGLOC-NEXT: store <2 x i64> [[AA_0_COPYLOAD]], <2 x i64>* [[AA_0_BPTR_SROA_CAST]], align 2, !dbg [[DBG48]]
-; DEBUGLOC-NEXT: ret void, !dbg [[DBG49:![0-9]+]]
-;
+; CHECK: load <2 x i64>, <2 x i64>* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64>* {{.*}}, align 2
+; CHECK: ret void
entry:
%aa = alloca <2 x i64>, align 16
; expecting. However, also check that any offset within an alloca can in turn
; reduce the alignment.
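+; (The second slice appears to start at offset 6 within the original alloca, so
+; only 2-byte alignment can be guaranteed for it, hence the "align 2" below.)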
; CHECK-LABEL: @test3(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca [22 x i8], align 8
-; CHECK-NEXT: [[B_SROA_0:%.*]] = alloca [18 x i8], align 2
-; CHECK-NEXT: [[A_SROA_0_0_A_RAW_SROA_IDX:%.*]] = getelementptr inbounds [22 x i8], [22 x i8]* [[A_SROA_0]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[A_SROA_0_0_A_RAW_SROA_IDX]], i8* align 8 [[X:%.*]], i32 22, i1 false)
-; CHECK-NEXT: [[B_SROA_0_6_B_GEP_SROA_IDX:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* [[B_SROA_0]], i64 0, i64 0
-; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[B_SROA_0_6_B_GEP_SROA_IDX]], i8* align 2 [[X]], i32 18, i1 false)
-; CHECK-NEXT: ret void
-;
-; DEBUGLOC-LABEL: @test3(
-; DEBUGLOC-NEXT: entry:
-; DEBUGLOC-NEXT: [[A_SROA_0:%.*]] = alloca [22 x i8], align 8, !dbg [[DBG57:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata { i8*, i8*, i8* }* undef, metadata [[META52:![0-9]+]], metadata !DIExpression()), !dbg [[DBG57]]
-; DEBUGLOC-NEXT: [[B_SROA_0:%.*]] = alloca [18 x i8], align 2, !dbg [[DBG58:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata { i8*, i8*, i8* }* undef, metadata [[META53:![0-9]+]], metadata !DIExpression()), !dbg [[DBG58]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META54:![0-9]+]], metadata !DIExpression()), !dbg [[DBG59:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_0_0_A_RAW_SROA_IDX:%.*]] = getelementptr inbounds [22 x i8], [22 x i8]* [[A_SROA_0]], i64 0, i64 0, !dbg [[DBG60:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 [[A_SROA_0_0_A_RAW_SROA_IDX]], i8* align 8 [[X:%.*]], i32 22, i1 false), !dbg [[DBG60]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META55:![0-9]+]], metadata !DIExpression()), !dbg [[DBG61:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META56:![0-9]+]], metadata !DIExpression()), !dbg [[DBG62:![0-9]+]]
-; DEBUGLOC-NEXT: [[B_SROA_0_6_B_GEP_SROA_IDX:%.*]] = getelementptr inbounds [18 x i8], [18 x i8]* [[B_SROA_0]], i64 0, i64 0, !dbg [[DBG63:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 2 [[B_SROA_0_6_B_GEP_SROA_IDX]], i8* align 2 [[X]], i32 18, i1 false), !dbg [[DBG63]]
-; DEBUGLOC-NEXT: ret void, !dbg [[DBG64:![0-9]+]]
-;
+; CHECK: alloca [22 x i8], align 8
+; CHECK: alloca [18 x i8], align 2
+; CHECK: ret void
entry:
%a = alloca { i8*, i8*, i8* }
; split or promoted out of existence.
;
; CHECK-LABEL: @test5(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT: [[A_SROA_3:%.*]] = alloca [9 x i8], align 1
-; CHECK-NEXT: [[A_SROA_0_0_PTR1_SROA_CAST2:%.*]] = bitcast [9 x i8]* [[A_SROA_0]] to double*
-; CHECK-NEXT: store volatile double 0.000000e+00, double* [[A_SROA_0_0_PTR1_SROA_CAST2]], align 1
-; CHECK-NEXT: [[A_SROA_0_7_WEIRD_CAST1_SROA_IDX4:%.*]] = getelementptr inbounds [9 x i8], [9 x i8]* [[A_SROA_0]], i64 0, i64 7
-; CHECK-NEXT: [[A_SROA_0_7_WEIRD_CAST1_SROA_CAST5:%.*]] = bitcast i8* [[A_SROA_0_7_WEIRD_CAST1_SROA_IDX4]] to i16*
-; CHECK-NEXT: [[A_SROA_0_7_A_SROA_0_7_WEIRD_LOAD1:%.*]] = load volatile i16, i16* [[A_SROA_0_7_WEIRD_CAST1_SROA_CAST5]], align 1
-; CHECK-NEXT: [[A_SROA_0_0_PTR1_SROA_CAST3:%.*]] = bitcast [9 x i8]* [[A_SROA_0]] to double*
-; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_0_D1:%.*]] = load double, double* [[A_SROA_0_0_PTR1_SROA_CAST3]], align 1
-; CHECK-NEXT: [[A_SROA_3_0_PTR2_SROA_CAST:%.*]] = bitcast [9 x i8]* [[A_SROA_3]] to double*
-; CHECK-NEXT: store volatile double [[A_SROA_0_0_A_SROA_0_0_D1]], double* [[A_SROA_3_0_PTR2_SROA_CAST]], align 1
-; CHECK-NEXT: [[A_SROA_3_7_WEIRD_CAST2_SROA_IDX:%.*]] = getelementptr inbounds [9 x i8], [9 x i8]* [[A_SROA_3]], i64 0, i64 7
-; CHECK-NEXT: [[A_SROA_3_7_WEIRD_CAST2_SROA_CAST:%.*]] = bitcast i8* [[A_SROA_3_7_WEIRD_CAST2_SROA_IDX]] to i16*
-; CHECK-NEXT: [[A_SROA_3_7_A_SROA_3_16_WEIRD_LOAD2:%.*]] = load volatile i16, i16* [[A_SROA_3_7_WEIRD_CAST2_SROA_CAST]], align 1
-; CHECK-NEXT: ret void
-;
-; DEBUGLOC-LABEL: @test5(
-; DEBUGLOC-NEXT: entry:
-; DEBUGLOC-NEXT: [[A_SROA_0:%.*]] = alloca [9 x i8], align 1, !dbg [[DBG80:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_3:%.*]] = alloca [9 x i8], align 1, !dbg [[DBG80]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata [18 x i8]* undef, metadata [[META67:![0-9]+]], metadata !DIExpression()), !dbg [[DBG80]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META68:![0-9]+]], metadata !DIExpression()), !dbg [[DBG81:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata double* undef, metadata [[META69:![0-9]+]], metadata !DIExpression()), !dbg [[DBG82:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_0_0_PTR1_SROA_CAST2:%.*]] = bitcast [9 x i8]* [[A_SROA_0]] to double*, !dbg [[DBG83:![0-9]+]]
-; DEBUGLOC-NEXT: store volatile double 0.000000e+00, double* [[A_SROA_0_0_PTR1_SROA_CAST2]], align 1, !dbg [[DBG83]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META70:![0-9]+]], metadata !DIExpression()), !dbg [[DBG84:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i16* undef, metadata [[META71:![0-9]+]], metadata !DIExpression()), !dbg [[DBG85:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_0_7_WEIRD_CAST1_SROA_IDX4:%.*]] = getelementptr inbounds [9 x i8], [9 x i8]* [[A_SROA_0]], i64 0, i64 7, !dbg [[DBG86:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_0_7_WEIRD_CAST1_SROA_CAST5:%.*]] = bitcast i8* [[A_SROA_0_7_WEIRD_CAST1_SROA_IDX4]] to i16*, !dbg [[DBG86]]
-; DEBUGLOC-NEXT: [[A_SROA_0_7_A_SROA_0_7_WEIRD_LOAD1:%.*]] = load volatile i16, i16* [[A_SROA_0_7_WEIRD_CAST1_SROA_CAST5]], align 1, !dbg [[DBG86]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i16 [[A_SROA_0_7_A_SROA_0_7_WEIRD_LOAD1]], metadata [[META72:![0-9]+]], metadata !DIExpression()), !dbg [[DBG86]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META74:![0-9]+]], metadata !DIExpression()), !dbg [[DBG87:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata double* undef, metadata [[META75:![0-9]+]], metadata !DIExpression()), !dbg [[DBG88:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_0_0_PTR1_SROA_CAST3:%.*]] = bitcast [9 x i8]* [[A_SROA_0]] to double*, !dbg [[DBG89:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_0_0_A_SROA_0_0_D1:%.*]] = load double, double* [[A_SROA_0_0_PTR1_SROA_CAST3]], align 1, !dbg [[DBG89]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata double [[A_SROA_0_0_A_SROA_0_0_D1]], metadata [[META76:![0-9]+]], metadata !DIExpression()), !dbg [[DBG89]]
-; DEBUGLOC-NEXT: [[A_SROA_3_0_PTR2_SROA_CAST:%.*]] = bitcast [9 x i8]* [[A_SROA_3]] to double*, !dbg [[DBG90:![0-9]+]]
-; DEBUGLOC-NEXT: store volatile double [[A_SROA_0_0_A_SROA_0_0_D1]], double* [[A_SROA_3_0_PTR2_SROA_CAST]], align 1, !dbg [[DBG90]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META77:![0-9]+]], metadata !DIExpression()), !dbg [[DBG91:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i16* undef, metadata [[META78:![0-9]+]], metadata !DIExpression()), !dbg [[DBG92:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_3_7_WEIRD_CAST2_SROA_IDX:%.*]] = getelementptr inbounds [9 x i8], [9 x i8]* [[A_SROA_3]], i64 0, i64 7, !dbg [[DBG93:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_3_7_WEIRD_CAST2_SROA_CAST:%.*]] = bitcast i8* [[A_SROA_3_7_WEIRD_CAST2_SROA_IDX]] to i16*, !dbg [[DBG93]]
-; DEBUGLOC-NEXT: [[A_SROA_3_7_A_SROA_3_16_WEIRD_LOAD2:%.*]] = load volatile i16, i16* [[A_SROA_3_7_WEIRD_CAST2_SROA_CAST]], align 1, !dbg [[DBG93]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i16 [[A_SROA_3_7_A_SROA_3_16_WEIRD_LOAD2]], metadata [[META79:![0-9]+]], metadata !DIExpression()), !dbg [[DBG93]]
-; DEBUGLOC-NEXT: ret void, !dbg [[DBG94:![0-9]+]]
-;
+; CHECK: alloca [9 x i8]
+; CHECK: alloca [9 x i8]
+; CHECK: store volatile double 0.0{{.*}}, double* %{{.*}}, align 1
+; CHECK: load volatile i16, i16* %{{.*}}, align 1
+; CHECK: load double, double* %{{.*}}, align 1
+; CHECK: store volatile double %{{.*}}, double* %{{.*}}, align 1
+; CHECK: load volatile i16, i16* %{{.*}}, align 1
+; CHECK: ret void
entry:
%a = alloca [18 x i8]
; We should set the alignment on all load and store operations; make sure
; we choose an appropriate alignment.
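+; (Each promoted slice is a single double, so 8-byte alignment is used on the new
+; allocas and on every rewritten access; the trailing CHECK-NOT guards against
+; any further aligned access before the return.)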
; CHECK-LABEL: @test6(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A_SROA_0:%.*]] = alloca double, align 8
-; CHECK-NEXT: [[A_SROA_2:%.*]] = alloca double, align 8
-; CHECK-NEXT: store volatile double 0.000000e+00, double* [[A_SROA_0]], align 8
-; CHECK-NEXT: [[A_SROA_0_0_A_SROA_0_0_VAL:%.*]] = load double, double* [[A_SROA_0]], align 8
-; CHECK-NEXT: store volatile double [[A_SROA_0_0_A_SROA_0_0_VAL]], double* [[A_SROA_2]], align 8
-; CHECK-NEXT: ret void
-;
-; DEBUGLOC-LABEL: @test6(
-; DEBUGLOC-NEXT: entry:
-; DEBUGLOC-NEXT: [[A_SROA_0:%.*]] = alloca double, align 8, !dbg [[DBG103:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_2:%.*]] = alloca double, align 8, !dbg [[DBG103]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata [16 x i8]* undef, metadata [[META97:![0-9]+]], metadata !DIExpression()), !dbg [[DBG103]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META98:![0-9]+]], metadata !DIExpression()), !dbg [[DBG104:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata double* undef, metadata [[META99:![0-9]+]], metadata !DIExpression()), !dbg [[DBG105:![0-9]+]]
-; DEBUGLOC-NEXT: store volatile double 0.000000e+00, double* [[A_SROA_0]], align 8, !dbg [[DBG106:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META100:![0-9]+]], metadata !DIExpression()), !dbg [[DBG107:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata double* undef, metadata [[META101:![0-9]+]], metadata !DIExpression()), !dbg [[DBG108:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_0_0_A_SROA_0_0_VAL:%.*]] = load double, double* [[A_SROA_0]], align 8, !dbg [[DBG109:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata double [[A_SROA_0_0_A_SROA_0_0_VAL]], metadata [[META102:![0-9]+]], metadata !DIExpression()), !dbg [[DBG109]]
-; DEBUGLOC-NEXT: store volatile double [[A_SROA_0_0_A_SROA_0_0_VAL]], double* [[A_SROA_2]], align 8, !dbg [[DBG110:![0-9]+]]
-; DEBUGLOC-NEXT: ret void, !dbg [[DBG111:![0-9]+]]
-;
+; CHECK: alloca double, align 8{{$}}
+; CHECK: alloca double, align 8{{$}}
+; CHECK: store{{.*}}, align 8
+; CHECK: load{{.*}}, align 8
+; CHECK: store{{.*}}, align 8
+; CHECK-NOT: align
+; CHECK: ret void
entry:
%a = alloca [16 x i8]
; Test that we properly compute the destination alignment when rewriting
; memcpys as direct loads or stores.
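+; With the alloca rewritten away, the loads and stores produced from the two
+; memcpys address %out directly and keep the memcpys' 1-byte alignment, which
+; the "align 1" checks below verify.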
; CHECK-LABEL: @test7(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A_SROA_0_0_OUT_SROA_CAST:%.*]] = bitcast i8* [[OUT:%.*]] to double*
-; CHECK-NEXT: [[A_SROA_0_0_COPYLOAD:%.*]] = load double, double* [[A_SROA_0_0_OUT_SROA_CAST]], align 1
-; CHECK-NEXT: [[A_SROA_4_0_OUT_SROA_IDX:%.*]] = getelementptr inbounds i8, i8* [[OUT]], i64 8
-; CHECK-NEXT: [[A_SROA_4_0_OUT_SROA_CAST:%.*]] = bitcast i8* [[A_SROA_4_0_OUT_SROA_IDX]] to double*
-; CHECK-NEXT: [[A_SROA_4_0_COPYLOAD:%.*]] = load double, double* [[A_SROA_4_0_OUT_SROA_CAST]], align 1
-; CHECK-NEXT: [[A_SROA_0_0_OUT_SROA_CAST1:%.*]] = bitcast i8* [[OUT]] to double*
-; CHECK-NEXT: store double [[A_SROA_4_0_COPYLOAD]], double* [[A_SROA_0_0_OUT_SROA_CAST1]], align 1
-; CHECK-NEXT: [[A_SROA_4_0_OUT_SROA_IDX3:%.*]] = getelementptr inbounds i8, i8* [[OUT]], i64 8
-; CHECK-NEXT: [[A_SROA_4_0_OUT_SROA_CAST4:%.*]] = bitcast i8* [[A_SROA_4_0_OUT_SROA_IDX3]] to double*
-; CHECK-NEXT: store double [[A_SROA_0_0_COPYLOAD]], double* [[A_SROA_4_0_OUT_SROA_CAST4]], align 1
-; CHECK-NEXT: ret void
-;
-; DEBUGLOC-LABEL: @test7(
-; DEBUGLOC-NEXT: entry:
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata [16 x i8]* undef, metadata [[META114:![0-9]+]], metadata !DIExpression()), !dbg [[DBG121:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META115:![0-9]+]], metadata !DIExpression()), !dbg [[DBG122:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata double* undef, metadata [[META116:![0-9]+]], metadata !DIExpression()), !dbg [[DBG123:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* undef, metadata [[META117:![0-9]+]], metadata !DIExpression()), !dbg [[DBG124:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata double* undef, metadata [[META118:![0-9]+]], metadata !DIExpression()), !dbg [[DBG125:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_0_0_OUT_SROA_CAST:%.*]] = bitcast i8* [[OUT:%.*]] to double*, !dbg [[DBG126:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_0_0_COPYLOAD:%.*]] = load double, double* [[A_SROA_0_0_OUT_SROA_CAST]], align 1, !dbg [[DBG126]]
-; DEBUGLOC-NEXT: [[A_SROA_4_0_OUT_SROA_IDX:%.*]] = getelementptr inbounds i8, i8* [[OUT]], i64 8, !dbg [[DBG126]]
-; DEBUGLOC-NEXT: [[A_SROA_4_0_OUT_SROA_CAST:%.*]] = bitcast i8* [[A_SROA_4_0_OUT_SROA_IDX]] to double*, !dbg [[DBG126]]
-; DEBUGLOC-NEXT: [[A_SROA_4_0_COPYLOAD:%.*]] = load double, double* [[A_SROA_4_0_OUT_SROA_CAST]], align 1, !dbg [[DBG126]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata double [[A_SROA_4_0_COPYLOAD]], metadata [[META119:![0-9]+]], metadata !DIExpression()), !dbg [[DBG127:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata double [[A_SROA_0_0_COPYLOAD]], metadata [[META120:![0-9]+]], metadata !DIExpression()), !dbg [[DBG128:![0-9]+]]
-; DEBUGLOC-NEXT: [[A_SROA_0_0_OUT_SROA_CAST1:%.*]] = bitcast i8* [[OUT]] to double*, !dbg [[DBG129:![0-9]+]]
-; DEBUGLOC-NEXT: store double [[A_SROA_4_0_COPYLOAD]], double* [[A_SROA_0_0_OUT_SROA_CAST1]], align 1, !dbg [[DBG129]]
-; DEBUGLOC-NEXT: [[A_SROA_4_0_OUT_SROA_IDX3:%.*]] = getelementptr inbounds i8, i8* [[OUT]], i64 8, !dbg [[DBG129]]
-; DEBUGLOC-NEXT: [[A_SROA_4_0_OUT_SROA_CAST4:%.*]] = bitcast i8* [[A_SROA_4_0_OUT_SROA_IDX3]] to double*, !dbg [[DBG129]]
-; DEBUGLOC-NEXT: store double [[A_SROA_0_0_COPYLOAD]], double* [[A_SROA_4_0_OUT_SROA_CAST4]], align 1, !dbg [[DBG129]]
-; DEBUGLOC-NEXT: ret void, !dbg [[DBG130:![0-9]+]]
-;
+; CHECK-NOT: alloca
entry:
%a = alloca [16 x i8]
%ptr2 = bitcast i8* %raw2 to double*
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %raw1, i8* %out, i32 16, i1 false)
+; CHECK: %[[val2:.*]] = load double, double* %{{.*}}, align 1
+; CHECK: %[[val1:.*]] = load double, double* %{{.*}}, align 1
%val1 = load double, double* %ptr2, align 1
%val2 = load double, double* %ptr1, align 1
store double %val2, double* %ptr2, align 1
call void @llvm.memcpy.p0i8.p0i8.i32(i8* %out, i8* %raw1, i32 16, i1 false)
+; CHECK: store double %[[val1]], double* %{{.*}}, align 1
+; CHECK: store double %[[val2]], double* %{{.*}}, align 1
ret void
+; CHECK: ret void
}
define void @test8() {
; CHECK-LABEL: @test8(
-; CHECK-NEXT: [[PTR:%.*]] = alloca [5 x i32], align 1
-; CHECK-NEXT: [[PTR_8:%.*]] = bitcast [5 x i32]* [[PTR]] to i8*
-; CHECK-NEXT: call void @populate(i8* [[PTR_8]])
-; CHECK-NEXT: [[VAL_FCA_0_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 0
-; CHECK-NEXT: [[VAL_FCA_0_LOAD:%.*]] = load i32, i32* [[VAL_FCA_0_GEP]], align 1
-; CHECK-NEXT: [[VAL_FCA_0_INSERT:%.*]] = insertvalue [5 x i32] poison, i32 [[VAL_FCA_0_LOAD]], 0
-; CHECK-NEXT: [[VAL_FCA_1_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 1
-; CHECK-NEXT: [[VAL_FCA_1_LOAD:%.*]] = load i32, i32* [[VAL_FCA_1_GEP]], align 1
-; CHECK-NEXT: [[VAL_FCA_1_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_0_INSERT]], i32 [[VAL_FCA_1_LOAD]], 1
-; CHECK-NEXT: [[VAL_FCA_2_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 2
-; CHECK-NEXT: [[VAL_FCA_2_LOAD:%.*]] = load i32, i32* [[VAL_FCA_2_GEP]], align 1
-; CHECK-NEXT: [[VAL_FCA_2_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_1_INSERT]], i32 [[VAL_FCA_2_LOAD]], 2
-; CHECK-NEXT: [[VAL_FCA_3_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 3
-; CHECK-NEXT: [[VAL_FCA_3_LOAD:%.*]] = load i32, i32* [[VAL_FCA_3_GEP]], align 1
-; CHECK-NEXT: [[VAL_FCA_3_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_2_INSERT]], i32 [[VAL_FCA_3_LOAD]], 3
-; CHECK-NEXT: [[VAL_FCA_4_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 4
-; CHECK-NEXT: [[VAL_FCA_4_LOAD:%.*]] = load i32, i32* [[VAL_FCA_4_GEP]], align 1
-; CHECK-NEXT: [[VAL_FCA_4_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_3_INSERT]], i32 [[VAL_FCA_4_LOAD]], 4
-; CHECK-NEXT: ret void
-;
-; DEBUGLOC-LABEL: @test8(
-; DEBUGLOC-NEXT: [[PTR:%.*]] = alloca [5 x i32], align 1, !dbg [[DBG137:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata [5 x i32]* [[PTR]], metadata [[META133:![0-9]+]], metadata !DIExpression()), !dbg [[DBG137]]
-; DEBUGLOC-NEXT: [[PTR_8:%.*]] = bitcast [5 x i32]* [[PTR]] to i8*, !dbg [[DBG138:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* [[PTR_8]], metadata [[META134:![0-9]+]], metadata !DIExpression()), !dbg [[DBG138]]
-; DEBUGLOC-NEXT: call void @populate(i8* [[PTR_8]]), !dbg [[DBG139:![0-9]+]]
-; DEBUGLOC-NEXT: [[VAL_FCA_0_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 0, !dbg [[DBG140:![0-9]+]]
-; DEBUGLOC-NEXT: [[VAL_FCA_0_LOAD:%.*]] = load i32, i32* [[VAL_FCA_0_GEP]], align 1, !dbg [[DBG140]]
-; DEBUGLOC-NEXT: [[VAL_FCA_0_INSERT:%.*]] = insertvalue [5 x i32] poison, i32 [[VAL_FCA_0_LOAD]], 0, !dbg [[DBG140]]
-; DEBUGLOC-NEXT: [[VAL_FCA_1_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 1, !dbg [[DBG140]]
-; DEBUGLOC-NEXT: [[VAL_FCA_1_LOAD:%.*]] = load i32, i32* [[VAL_FCA_1_GEP]], align 1, !dbg [[DBG140]]
-; DEBUGLOC-NEXT: [[VAL_FCA_1_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_0_INSERT]], i32 [[VAL_FCA_1_LOAD]], 1, !dbg [[DBG140]]
-; DEBUGLOC-NEXT: [[VAL_FCA_2_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 2, !dbg [[DBG140]]
-; DEBUGLOC-NEXT: [[VAL_FCA_2_LOAD:%.*]] = load i32, i32* [[VAL_FCA_2_GEP]], align 1, !dbg [[DBG140]]
-; DEBUGLOC-NEXT: [[VAL_FCA_2_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_1_INSERT]], i32 [[VAL_FCA_2_LOAD]], 2, !dbg [[DBG140]]
-; DEBUGLOC-NEXT: [[VAL_FCA_3_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 3, !dbg [[DBG140]]
-; DEBUGLOC-NEXT: [[VAL_FCA_3_LOAD:%.*]] = load i32, i32* [[VAL_FCA_3_GEP]], align 1, !dbg [[DBG140]]
-; DEBUGLOC-NEXT: [[VAL_FCA_3_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_2_INSERT]], i32 [[VAL_FCA_3_LOAD]], 3, !dbg [[DBG140]]
-; DEBUGLOC-NEXT: [[VAL_FCA_4_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 4, !dbg [[DBG140]]
-; DEBUGLOC-NEXT: [[VAL_FCA_4_LOAD:%.*]] = load i32, i32* [[VAL_FCA_4_GEP]], align 1, !dbg [[DBG140]]
-; DEBUGLOC-NEXT: [[VAL_FCA_4_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_3_INSERT]], i32 [[VAL_FCA_4_LOAD]], 4, !dbg [[DBG140]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata [5 x i32] [[VAL_FCA_4_INSERT]], metadata [[META135:![0-9]+]], metadata !DIExpression()), !dbg [[DBG140]]
-; DEBUGLOC-NEXT: ret void, !dbg [[DBG141:![0-9]+]]
-;
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
+; CHECK: load i32, {{.*}}, align 1
%ptr = alloca [5 x i32], align 1
%ptr.8 = bitcast [5 x i32]* %ptr to i8*
define void @test9() {
; CHECK-LABEL: @test9(
-; CHECK-NEXT: [[PTR:%.*]] = alloca [5 x i32], align 8
-; CHECK-NEXT: [[PTR_8:%.*]] = bitcast [5 x i32]* [[PTR]] to i8*
-; CHECK-NEXT: call void @populate(i8* [[PTR_8]])
-; CHECK-NEXT: [[VAL_FCA_0_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 0
-; CHECK-NEXT: [[VAL_FCA_0_LOAD:%.*]] = load i32, i32* [[VAL_FCA_0_GEP]], align 8
-; CHECK-NEXT: [[VAL_FCA_0_INSERT:%.*]] = insertvalue [5 x i32] poison, i32 [[VAL_FCA_0_LOAD]], 0
-; CHECK-NEXT: [[VAL_FCA_1_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 1
-; CHECK-NEXT: [[VAL_FCA_1_LOAD:%.*]] = load i32, i32* [[VAL_FCA_1_GEP]], align 4
-; CHECK-NEXT: [[VAL_FCA_1_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_0_INSERT]], i32 [[VAL_FCA_1_LOAD]], 1
-; CHECK-NEXT: [[VAL_FCA_2_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 2
-; CHECK-NEXT: [[VAL_FCA_2_LOAD:%.*]] = load i32, i32* [[VAL_FCA_2_GEP]], align 8
-; CHECK-NEXT: [[VAL_FCA_2_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_1_INSERT]], i32 [[VAL_FCA_2_LOAD]], 2
-; CHECK-NEXT: [[VAL_FCA_3_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 3
-; CHECK-NEXT: [[VAL_FCA_3_LOAD:%.*]] = load i32, i32* [[VAL_FCA_3_GEP]], align 4
-; CHECK-NEXT: [[VAL_FCA_3_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_2_INSERT]], i32 [[VAL_FCA_3_LOAD]], 3
-; CHECK-NEXT: [[VAL_FCA_4_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 4
-; CHECK-NEXT: [[VAL_FCA_4_LOAD:%.*]] = load i32, i32* [[VAL_FCA_4_GEP]], align 8
-; CHECK-NEXT: [[VAL_FCA_4_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_3_INSERT]], i32 [[VAL_FCA_4_LOAD]], 4
-; CHECK-NEXT: ret void
-;
-; DEBUGLOC-LABEL: @test9(
-; DEBUGLOC-NEXT: [[PTR:%.*]] = alloca [5 x i32], align 8, !dbg [[DBG147:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata [5 x i32]* [[PTR]], metadata [[META144:![0-9]+]], metadata !DIExpression()), !dbg [[DBG147]]
-; DEBUGLOC-NEXT: [[PTR_8:%.*]] = bitcast [5 x i32]* [[PTR]] to i8*, !dbg [[DBG148:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* [[PTR_8]], metadata [[META145:![0-9]+]], metadata !DIExpression()), !dbg [[DBG148]]
-; DEBUGLOC-NEXT: call void @populate(i8* [[PTR_8]]), !dbg [[DBG149:![0-9]+]]
-; DEBUGLOC-NEXT: [[VAL_FCA_0_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 0, !dbg [[DBG150:![0-9]+]]
-; DEBUGLOC-NEXT: [[VAL_FCA_0_LOAD:%.*]] = load i32, i32* [[VAL_FCA_0_GEP]], align 8, !dbg [[DBG150]]
-; DEBUGLOC-NEXT: [[VAL_FCA_0_INSERT:%.*]] = insertvalue [5 x i32] poison, i32 [[VAL_FCA_0_LOAD]], 0, !dbg [[DBG150]]
-; DEBUGLOC-NEXT: [[VAL_FCA_1_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 1, !dbg [[DBG150]]
-; DEBUGLOC-NEXT: [[VAL_FCA_1_LOAD:%.*]] = load i32, i32* [[VAL_FCA_1_GEP]], align 4, !dbg [[DBG150]]
-; DEBUGLOC-NEXT: [[VAL_FCA_1_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_0_INSERT]], i32 [[VAL_FCA_1_LOAD]], 1, !dbg [[DBG150]]
-; DEBUGLOC-NEXT: [[VAL_FCA_2_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 2, !dbg [[DBG150]]
-; DEBUGLOC-NEXT: [[VAL_FCA_2_LOAD:%.*]] = load i32, i32* [[VAL_FCA_2_GEP]], align 8, !dbg [[DBG150]]
-; DEBUGLOC-NEXT: [[VAL_FCA_2_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_1_INSERT]], i32 [[VAL_FCA_2_LOAD]], 2, !dbg [[DBG150]]
-; DEBUGLOC-NEXT: [[VAL_FCA_3_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 3, !dbg [[DBG150]]
-; DEBUGLOC-NEXT: [[VAL_FCA_3_LOAD:%.*]] = load i32, i32* [[VAL_FCA_3_GEP]], align 4, !dbg [[DBG150]]
-; DEBUGLOC-NEXT: [[VAL_FCA_3_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_2_INSERT]], i32 [[VAL_FCA_3_LOAD]], 3, !dbg [[DBG150]]
-; DEBUGLOC-NEXT: [[VAL_FCA_4_GEP:%.*]] = getelementptr inbounds [5 x i32], [5 x i32]* [[PTR]], i32 0, i32 4, !dbg [[DBG150]]
-; DEBUGLOC-NEXT: [[VAL_FCA_4_LOAD:%.*]] = load i32, i32* [[VAL_FCA_4_GEP]], align 8, !dbg [[DBG150]]
-; DEBUGLOC-NEXT: [[VAL_FCA_4_INSERT:%.*]] = insertvalue [5 x i32] [[VAL_FCA_3_INSERT]], i32 [[VAL_FCA_4_LOAD]], 4, !dbg [[DBG150]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata [5 x i32] [[VAL_FCA_4_INSERT]], metadata [[META146:![0-9]+]], metadata !DIExpression()), !dbg [[DBG150]]
-; DEBUGLOC-NEXT: ret void, !dbg [[DBG151:![0-9]+]]
-;
+; CHECK: load i32, {{.*}}, align 8
+; CHECK: load i32, {{.*}}, align 4
+; CHECK: load i32, {{.*}}, align 8
+; CHECK: load i32, {{.*}}, align 4
+; CHECK: load i32, {{.*}}, align 8
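+; With the alloca aligned to 8, the five i32 elements sit at offsets 0, 4, 8, 12, and 16,
+; so the rewritten loads alternate between align 8 and align 4 as checked above.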
%ptr = alloca [5 x i32], align 8
%ptr.8 = bitcast [5 x i32]* %ptr to i8*
define void @test10() {
; CHECK-LABEL: @test10(
-; CHECK-NEXT: [[PTR:%.*]] = alloca { i32, i8, i8, { i8, i16 } }, align 2
-; CHECK-NEXT: [[PTR_8:%.*]] = bitcast { i32, i8, i8, { i8, i16 } }* [[PTR]] to i8*
-; CHECK-NEXT: call void @populate(i8* [[PTR_8]])
-; CHECK-NEXT: [[VAL_FCA_0_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 0
-; CHECK-NEXT: [[VAL_FCA_0_LOAD:%.*]] = load i32, i32* [[VAL_FCA_0_GEP]], align 2
-; CHECK-NEXT: [[VAL_FCA_0_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } poison, i32 [[VAL_FCA_0_LOAD]], 0
-; CHECK-NEXT: [[VAL_FCA_1_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 1
-; CHECK-NEXT: [[VAL_FCA_1_LOAD:%.*]] = load i8, i8* [[VAL_FCA_1_GEP]], align 2
-; CHECK-NEXT: [[VAL_FCA_1_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_0_INSERT]], i8 [[VAL_FCA_1_LOAD]], 1
-; CHECK-NEXT: [[VAL_FCA_2_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 2
-; CHECK-NEXT: [[VAL_FCA_2_LOAD:%.*]] = load i8, i8* [[VAL_FCA_2_GEP]], align 1
-; CHECK-NEXT: [[VAL_FCA_2_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_1_INSERT]], i8 [[VAL_FCA_2_LOAD]], 2
-; CHECK-NEXT: [[VAL_FCA_3_0_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 3, i32 0
-; CHECK-NEXT: [[VAL_FCA_3_0_LOAD:%.*]] = load i8, i8* [[VAL_FCA_3_0_GEP]], align 2
-; CHECK-NEXT: [[VAL_FCA_3_0_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_2_INSERT]], i8 [[VAL_FCA_3_0_LOAD]], 3, 0
-; CHECK-NEXT: [[VAL_FCA_3_1_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 3, i32 1
-; CHECK-NEXT: [[VAL_FCA_3_1_LOAD:%.*]] = load i16, i16* [[VAL_FCA_3_1_GEP]], align 2
-; CHECK-NEXT: [[VAL_FCA_3_1_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_3_0_INSERT]], i16 [[VAL_FCA_3_1_LOAD]], 3, 1
-; CHECK-NEXT: ret void
-;
-; DEBUGLOC-LABEL: @test10(
-; DEBUGLOC-NEXT: [[PTR:%.*]] = alloca { i32, i8, i8, { i8, i16 } }, align 2, !dbg [[DBG158:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata { i32, i8, i8, { i8, i16 } }* [[PTR]], metadata [[META154:![0-9]+]], metadata !DIExpression()), !dbg [[DBG158]]
-; DEBUGLOC-NEXT: [[PTR_8:%.*]] = bitcast { i32, i8, i8, { i8, i16 } }* [[PTR]] to i8*, !dbg [[DBG159:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i8* [[PTR_8]], metadata [[META155:![0-9]+]], metadata !DIExpression()), !dbg [[DBG159]]
-; DEBUGLOC-NEXT: call void @populate(i8* [[PTR_8]]), !dbg [[DBG160:![0-9]+]]
-; DEBUGLOC-NEXT: [[VAL_FCA_0_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 0, !dbg [[DBG161:![0-9]+]]
-; DEBUGLOC-NEXT: [[VAL_FCA_0_LOAD:%.*]] = load i32, i32* [[VAL_FCA_0_GEP]], align 2, !dbg [[DBG161]]
-; DEBUGLOC-NEXT: [[VAL_FCA_0_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } poison, i32 [[VAL_FCA_0_LOAD]], 0, !dbg [[DBG161]]
-; DEBUGLOC-NEXT: [[VAL_FCA_1_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 1, !dbg [[DBG161]]
-; DEBUGLOC-NEXT: [[VAL_FCA_1_LOAD:%.*]] = load i8, i8* [[VAL_FCA_1_GEP]], align 2, !dbg [[DBG161]]
-; DEBUGLOC-NEXT: [[VAL_FCA_1_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_0_INSERT]], i8 [[VAL_FCA_1_LOAD]], 1, !dbg [[DBG161]]
-; DEBUGLOC-NEXT: [[VAL_FCA_2_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 2, !dbg [[DBG161]]
-; DEBUGLOC-NEXT: [[VAL_FCA_2_LOAD:%.*]] = load i8, i8* [[VAL_FCA_2_GEP]], align 1, !dbg [[DBG161]]
-; DEBUGLOC-NEXT: [[VAL_FCA_2_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_1_INSERT]], i8 [[VAL_FCA_2_LOAD]], 2, !dbg [[DBG161]]
-; DEBUGLOC-NEXT: [[VAL_FCA_3_0_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 3, i32 0, !dbg [[DBG161]]
-; DEBUGLOC-NEXT: [[VAL_FCA_3_0_LOAD:%.*]] = load i8, i8* [[VAL_FCA_3_0_GEP]], align 2, !dbg [[DBG161]]
-; DEBUGLOC-NEXT: [[VAL_FCA_3_0_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_2_INSERT]], i8 [[VAL_FCA_3_0_LOAD]], 3, 0, !dbg [[DBG161]]
-; DEBUGLOC-NEXT: [[VAL_FCA_3_1_GEP:%.*]] = getelementptr inbounds { i32, i8, i8, { i8, i16 } }, { i32, i8, i8, { i8, i16 } }* [[PTR]], i32 0, i32 3, i32 1, !dbg [[DBG161]]
-; DEBUGLOC-NEXT: [[VAL_FCA_3_1_LOAD:%.*]] = load i16, i16* [[VAL_FCA_3_1_GEP]], align 2, !dbg [[DBG161]]
-; DEBUGLOC-NEXT: [[VAL_FCA_3_1_INSERT:%.*]] = insertvalue { i32, i8, i8, { i8, i16 } } [[VAL_FCA_3_0_INSERT]], i16 [[VAL_FCA_3_1_LOAD]], 3, 1, !dbg [[DBG161]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata { i32, i8, i8, { i8, i16 } } [[VAL_FCA_3_1_INSERT]], metadata [[META156:![0-9]+]], metadata !DIExpression()), !dbg [[DBG161]]
-; DEBUGLOC-NEXT: ret void, !dbg [[DBG162:![0-9]+]]
-;
+; CHECK: load i32, {{.*}}, align 2
+; CHECK: load i8, {{.*}}, align 2
+; CHECK: load i8, {{.*}}, align 1
+; CHECK: load i8, {{.*}}, align 2
+; CHECK: load i16, {{.*}}, align 2
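+; Here the struct elements land at offsets 0, 4, 5, 6, and 8 of an align-2 alloca, which
+; is why the i8 at offset 5 only gets align 1 while the other accesses keep align 2.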
%ptr = alloca {i32, i8, i8, {i8, i16}}, align 2
%ptr.8 = bitcast {i32, i8, i8, {i8, i16}}* %ptr to i8*
%struct = type { i32, i32 }
define dso_local i32 @pr45010(%struct* %A) {
-; CHECK-LABEL: @pr45010(
-; CHECK-NEXT: [[B_SROA_0:%.*]] = alloca i32, align 4
-; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT:%.*]], %struct* [[A:%.*]], i32 0, i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_I]], align 4
-; CHECK-NEXT: store atomic volatile i32 [[TMP1]], i32* [[B_SROA_0]] release, align 4
-; CHECK-NEXT: [[B_SROA_0_0_B_SROA_0_0_X:%.*]] = load atomic volatile i32, i32* [[B_SROA_0]] acquire, align 4
-; CHECK-NEXT: ret i32 [[B_SROA_0_0_B_SROA_0_0_X]]
-;
-; DEBUGLOC-LABEL: @pr45010(
-; DEBUGLOC-NEXT: [[B_SROA_0:%.*]] = alloca i32, align 4, !dbg [[DBG172:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata %struct* undef, metadata [[META165:![0-9]+]], metadata !DIExpression()), !dbg [[DBG172]]
-; DEBUGLOC-NEXT: [[A_I:%.*]] = getelementptr inbounds [[STRUCT:%.*]], %struct* [[A:%.*]], i32 0, i32 0, !dbg [[DBG173:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i32* [[A_I]], metadata [[META166:![0-9]+]], metadata !DIExpression()), !dbg [[DBG173]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i32* undef, metadata [[META167:![0-9]+]], metadata !DIExpression()), !dbg [[DBG174:![0-9]+]]
-; DEBUGLOC-NEXT: [[TMP1:%.*]] = load i32, i32* [[A_I]], align 4, !dbg [[DBG175:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i32 [[TMP1]], metadata [[META168:![0-9]+]], metadata !DIExpression()), !dbg [[DBG175]]
-; DEBUGLOC-NEXT: store atomic volatile i32 [[TMP1]], i32* [[B_SROA_0]] release, align 4, !dbg [[DBG176:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i32* undef, metadata [[META170:![0-9]+]], metadata !DIExpression()), !dbg [[DBG177:![0-9]+]]
-; DEBUGLOC-NEXT: [[B_SROA_0_0_B_SROA_0_0_X:%.*]] = load atomic volatile i32, i32* [[B_SROA_0]] acquire, align 4, !dbg [[DBG178:![0-9]+]]
-; DEBUGLOC-NEXT: call void @llvm.dbg.value(metadata i32 [[B_SROA_0_0_B_SROA_0_0_X]], metadata [[META171:![0-9]+]], metadata !DIExpression()), !dbg [[DBG178]]
-; DEBUGLOC-NEXT: ret i32 [[B_SROA_0_0_B_SROA_0_0_X]], !dbg [[DBG179:![0-9]+]]
-;
+; CHECK-LABEL: @pr45010
+; CHECK: load atomic volatile i32, {{.*}}, align 4
%B = alloca %struct, align 4
%A.i = getelementptr inbounds %struct, %struct* %A, i32 0, i32 0
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=sroa -S | FileCheck %s
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
; ordering.
;
; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A_SROA_3_0_INSERT_EXT:%.*]] = zext i8 0 to i24
-; CHECK-NEXT: [[A_SROA_3_0_INSERT_MASK:%.*]] = and i24 undef, -256
-; CHECK-NEXT: [[A_SROA_3_0_INSERT_INSERT:%.*]] = or i24 [[A_SROA_3_0_INSERT_MASK]], [[A_SROA_3_0_INSERT_EXT]]
-; CHECK-NEXT: [[A_SROA_2_0_INSERT_EXT:%.*]] = zext i8 0 to i24
-; CHECK-NEXT: [[A_SROA_2_0_INSERT_SHIFT:%.*]] = shl i24 [[A_SROA_2_0_INSERT_EXT]], 8
-; CHECK-NEXT: [[A_SROA_2_0_INSERT_MASK:%.*]] = and i24 [[A_SROA_3_0_INSERT_INSERT]], -65281
-; CHECK-NEXT: [[A_SROA_2_0_INSERT_INSERT:%.*]] = or i24 [[A_SROA_2_0_INSERT_MASK]], [[A_SROA_2_0_INSERT_SHIFT]]
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_EXT:%.*]] = zext i8 0 to i24
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_SHIFT:%.*]] = shl i24 [[A_SROA_0_0_INSERT_EXT]], 16
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_MASK:%.*]] = and i24 [[A_SROA_2_0_INSERT_INSERT]], 65535
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_INSERT:%.*]] = or i24 [[A_SROA_0_0_INSERT_MASK]], [[A_SROA_0_0_INSERT_SHIFT]]
-; CHECK-NEXT: [[B_SROA_0_0_EXTRACT_SHIFT:%.*]] = lshr i24 [[A_SROA_0_0_INSERT_INSERT]], 16
-; CHECK-NEXT: [[B_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i24 [[B_SROA_0_0_EXTRACT_SHIFT]] to i8
-; CHECK-NEXT: [[B_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i24 [[A_SROA_0_0_INSERT_INSERT]], 8
-; CHECK-NEXT: [[B_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i24 [[B_SROA_2_0_EXTRACT_SHIFT]] to i8
-; CHECK-NEXT: [[B_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i24 [[A_SROA_0_0_INSERT_INSERT]] to i8
-; CHECK-NEXT: [[BSUM0:%.*]] = add i8 [[B_SROA_0_0_EXTRACT_TRUNC]], [[B_SROA_2_0_EXTRACT_TRUNC]]
-; CHECK-NEXT: [[BSUM1:%.*]] = add i8 [[BSUM0]], [[B_SROA_3_0_EXTRACT_TRUNC]]
-; CHECK-NEXT: ret i8 [[BSUM1]]
-;
entry:
%a = alloca [3 x i8]
%b = alloca [3 x i8]
+; CHECK-NOT: alloca
%a0ptr = getelementptr [3 x i8], [3 x i8]* %a, i64 0, i32 0
store i8 0, i8* %a0ptr
store i8 0, i8* %a2ptr
%aiptr = bitcast [3 x i8]* %a to i24*
%ai = load i24, i24* %aiptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[ext2:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256
+; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]]
+; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
+; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
+; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift0:.*]] = shl i24 %[[ext0]], 16
+; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], 65535
+; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[shift0]]
%biptr = bitcast [3 x i8]* %b to i24*
store i24 %ai, i24* %biptr
%b1 = load i8, i8* %b1ptr
%b2ptr = getelementptr [3 x i8], [3 x i8]* %b, i64 0, i32 2
%b2 = load i8, i8* %b2ptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[shift0:.*]] = lshr i24 %[[insert0]], 16
+; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
+; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
+; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert0]] to i8
%bsum0 = add i8 %b0, %b1
%bsum1 = add i8 %bsum0, %b2
ret i8 %bsum1
+; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
+; CHECK-NEXT: ret i8 %[[sum1]]
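+; Every byte stored into %a is zero, so constant folding the chain above reduces the whole
+; function to 'ret i8 0'; the checks only care that no alloca, load, or store survives.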
}
define i64 @test2() {
; Test for various mixed sizes of integer loads and stores all getting
; promoted.
;
; CHECK-LABEL: @test2(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A_SROA_2_SROA_4_0_INSERT_EXT:%.*]] = zext i8 1 to i40
-; CHECK-NEXT: [[A_SROA_2_SROA_4_0_INSERT_MASK:%.*]] = and i40 undef, -256
-; CHECK-NEXT: [[A_SROA_2_SROA_4_0_INSERT_INSERT:%.*]] = or i40 [[A_SROA_2_SROA_4_0_INSERT_MASK]], [[A_SROA_2_SROA_4_0_INSERT_EXT]]
-; CHECK-NEXT: [[A_SROA_2_SROA_3_0_INSERT_EXT:%.*]] = zext i24 0 to i40
-; CHECK-NEXT: [[A_SROA_2_SROA_3_0_INSERT_SHIFT:%.*]] = shl i40 [[A_SROA_2_SROA_3_0_INSERT_EXT]], 8
-; CHECK-NEXT: [[A_SROA_2_SROA_3_0_INSERT_MASK:%.*]] = and i40 [[A_SROA_2_SROA_4_0_INSERT_INSERT]], -4294967041
-; CHECK-NEXT: [[A_SROA_2_SROA_3_0_INSERT_INSERT:%.*]] = or i40 [[A_SROA_2_SROA_3_0_INSERT_MASK]], [[A_SROA_2_SROA_3_0_INSERT_SHIFT]]
-; CHECK-NEXT: [[A_SROA_2_SROA_0_0_INSERT_EXT:%.*]] = zext i8 0 to i40
-; CHECK-NEXT: [[A_SROA_2_SROA_0_0_INSERT_SHIFT:%.*]] = shl i40 [[A_SROA_2_SROA_0_0_INSERT_EXT]], 32
-; CHECK-NEXT: [[A_SROA_2_SROA_0_0_INSERT_MASK:%.*]] = and i40 [[A_SROA_2_SROA_3_0_INSERT_INSERT]], 4294967295
-; CHECK-NEXT: [[A_SROA_2_SROA_0_0_INSERT_INSERT:%.*]] = or i40 [[A_SROA_2_SROA_0_0_INSERT_MASK]], [[A_SROA_2_SROA_0_0_INSERT_SHIFT]]
-; CHECK-NEXT: [[A_SROA_2_0_INSERT_EXT:%.*]] = zext i40 [[A_SROA_2_SROA_0_0_INSERT_INSERT]] to i56
-; CHECK-NEXT: [[A_SROA_2_0_INSERT_MASK:%.*]] = and i56 undef, -1099511627776
-; CHECK-NEXT: [[A_SROA_2_0_INSERT_INSERT:%.*]] = or i56 [[A_SROA_2_0_INSERT_MASK]], [[A_SROA_2_0_INSERT_EXT]]
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_EXT:%.*]] = zext i16 1 to i56
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_SHIFT:%.*]] = shl i56 [[A_SROA_0_0_INSERT_EXT]], 40
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_MASK:%.*]] = and i56 [[A_SROA_2_0_INSERT_INSERT]], 1099511627775
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_INSERT:%.*]] = or i56 [[A_SROA_0_0_INSERT_MASK]], [[A_SROA_0_0_INSERT_SHIFT]]
-; CHECK-NEXT: [[RET:%.*]] = zext i56 [[A_SROA_0_0_INSERT_INSERT]] to i64
-; CHECK-NEXT: ret i64 [[RET]]
-;
entry:
%a = alloca [7 x i8]
+; CHECK-NOT: alloca
%a0ptr = getelementptr [7 x i8], [7 x i8]* %a, i64 0, i32 0
%a1ptr = getelementptr [7 x i8], [7 x i8]* %a, i64 0, i32 1
%a2ptr = getelementptr [7 x i8], [7 x i8]* %a, i64 0, i32 2
%a3ptr = getelementptr [7 x i8], [7 x i8]* %a, i64 0, i32 3
+; CHECK-NOT: store
+; CHECK-NOT: load
%a0i16ptr = bitcast i8* %a0ptr to i16*
store i16 1, i16* %a0i16ptr
; the alloca is split into multiple slices
; Here, i8 1 is for %a[6]
+; CHECK: %[[ext1:.*]] = zext i8 1 to i40
+; CHECK-NEXT: %[[mask1:.*]] = and i40 undef, -256
+; CHECK-NEXT: %[[insert1:.*]] = or i40 %[[mask1]], %[[ext1]]
; Here, i24 0 is for %a[3] to %a[5]
+; CHECK-NEXT: %[[ext2:.*]] = zext i24 0 to i40
+; CHECK-NEXT: %[[shift2:.*]] = shl i40 %[[ext2]], 8
+; CHECK-NEXT: %[[mask2:.*]] = and i40 %[[insert1]], -4294967041
+; CHECK-NEXT: %[[insert2:.*]] = or i40 %[[mask2]], %[[shift2]]
; Here, i8 0 is for %a[2]
+; CHECK-NEXT: %[[ext3:.*]] = zext i8 0 to i40
+; CHECK-NEXT: %[[shift3:.*]] = shl i40 %[[ext3]], 32
+; CHECK-NEXT: %[[mask3:.*]] = and i40 %[[insert2]], 4294967295
+; CHECK-NEXT: %[[insert3:.*]] = or i40 %[[mask3]], %[[shift3]]
+; CHECK-NEXT: %[[ext4:.*]] = zext i40 %[[insert3]] to i56
+; CHECK-NEXT: %[[mask4:.*]] = and i56 undef, -1099511627776
+; CHECK-NEXT: %[[insert4:.*]] = or i56 %[[mask4]], %[[ext4]]
+; CHECK-NOT: store
+; CHECK-NOT: load
%aiptr = bitcast [7 x i8]* %a to i56*
%ai = load i56, i56* %aiptr
%ret = zext i56 %ai to i64
ret i64 %ret
; Here, i16 1 is for %a[0] to %a[1]
+; CHECK-NEXT: %[[ext5:.*]] = zext i16 1 to i56
+; CHECK-NEXT: %[[shift5:.*]] = shl i56 %[[ext5]], 40
+; CHECK-NEXT: %[[mask5:.*]] = and i56 %[[insert4]], 1099511627775
+; CHECK-NEXT: %[[insert5:.*]] = or i56 %[[mask5]], %[[shift5]]
+; CHECK-NEXT: %[[ret:.*]] = zext i56 %[[insert5]] to i64
+; CHECK-NEXT: ret i64 %[[ret]]
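+; As a sanity check, constant folding the chain above (e.g. with -instcombine) gives
+; i64 1099511627777: the i16 1 occupies bits [40,56) and the i8 1 occupies bits [0,8).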
}
define i64 @PR14132(i1 %flag) {
; CHECK-LABEL: @PR14132(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br i1 [[FLAG:%.*]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
-; CHECK: if.then:
-; CHECK-NEXT: [[B_0_LOAD_EXT:%.*]] = zext i8 1 to i64
-; CHECK-NEXT: [[B_0_ENDIAN_SHIFT:%.*]] = shl i64 [[B_0_LOAD_EXT]], 56
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: [[PTR_0_SROA_SPECULATED:%.*]] = phi i64 [ [[B_0_ENDIAN_SHIFT]], [[IF_THEN]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: ret i64 [[PTR_0_SROA_SPECULATED]]
-;
; Here we form a PHI-node by promoting the pointer alloca first, and then in
; order to promote the other two allocas, we speculate the load of the
; now-phi-node-pointer. In doing so we end up loading a 64-bit value from an i8
; alloca.
%a = alloca i64, align 8
%b = alloca i8, align 8
%ptr = alloca i64*, align 8
+; CHECK-NOT: alloca
%ptr.cast = bitcast i64** %ptr to i8**
store i64 0, i64* %a
if.then:
store i8* %b, i8** %ptr.cast
br label %if.end
+; CHECK-NOT: store
+; CHECK: %[[ext:.*]] = zext i8 1 to i64
+; CHECK: %[[shift:.*]] = shl i64 %[[ext]], 56
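+; On this big-endian target the i8 value in %b becomes the most significant byte of the
+; speculated 64-bit load, hence the zext to i64 followed by the shift left of 56.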
if.end:
%tmp = load i64*, i64** %ptr
%result = load i64, i64* %tmp
+; CHECK-NOT: load
+; CHECK: %[[result:.*]] = phi i64 [ %[[shift]], %if.then ], [ 0, %entry ]
ret i64 %result
+; CHECK-NEXT: ret i64 %[[result]]
}
declare void @f(i64 %x, i32 %y)
define void @test3() {
; CHECK-LABEL: @test3(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A_SROA_3_0_INSERT_EXT:%.*]] = zext i32 134316040 to i64
-; CHECK-NEXT: [[A_SROA_3_0_INSERT_MASK:%.*]] = and i64 undef, -4294967296
-; CHECK-NEXT: [[A_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[A_SROA_3_0_INSERT_MASK]], [[A_SROA_3_0_INSERT_EXT]]
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_EXT:%.*]] = zext i32 8 to i64
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_SHIFT:%.*]] = shl i64 [[A_SROA_0_0_INSERT_EXT]], 32
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_MASK:%.*]] = and i64 [[A_SROA_3_0_INSERT_INSERT]], 4294967295
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[A_SROA_0_0_INSERT_MASK]], [[A_SROA_0_0_INSERT_SHIFT]]
-; CHECK-NEXT: call void @f(i64 [[A_SROA_0_0_INSERT_INSERT]], i32 8)
-; CHECK-NEXT: ret void
;
; This is a test that specifically exercises the big-endian lowering because it
; ends up splitting a 64-bit integer into two smaller integers and has a number
; of tricky aspects to the lowering. If we get this wrong, we
; would miscompile this by either dropping a most significant byte or least
; significant byte due to shrinking the [4,8) slice to an i24, or by failing to
; move the bytes around correctly.
+;
; The magical number 34494054408 is used because it has bits set in various
; bytes so that it is clear if those bytes fail to be propagated.
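; For reference, 34494054408 is 0x0000000808018008, so on this big-endian target the
; [0,4) slice holds i32 8 and the [4,8) slice holds i32 134316040 (0x08018008), which are
; exactly the constants that show up in the checks below.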
+;
; If you're debugging this, rather than using the direct magical numbers, run
; the IR through '-sroa -instcombine'. With '-instcombine' these will be
; constant folded, and if the i64 doesn't round-trip correctly, you've found
; a bug!
+;
entry:
%a = alloca { i32, i24 }, align 4
+; CHECK-NOT: alloca
%tmp0 = bitcast { i32, i24 }* %a to i64*
store i64 34494054408, i64* %tmp0
%tmp1 = load i64, i64* %tmp0, align 4
%tmp2 = bitcast { i32, i24 }* %a to i32*
%tmp3 = load i32, i32* %tmp2, align 4
+; CHECK: %[[HI_EXT:.*]] = zext i32 134316040 to i64
+; CHECK: %[[HI_INPUT:.*]] = and i64 undef, -4294967296
+; CHECK: %[[HI_MERGE:.*]] = or i64 %[[HI_INPUT]], %[[HI_EXT]]
+; CHECK: %[[LO_EXT:.*]] = zext i32 8 to i64
+; CHECK: %[[LO_SHL:.*]] = shl i64 %[[LO_EXT]], 32
+; CHECK: %[[LO_INPUT:.*]] = and i64 %[[HI_MERGE]], 4294967295
+; CHECK: %[[LO_MERGE:.*]] = or i64 %[[LO_INPUT]], %[[LO_SHL]]
call void @f(i64 %tmp1, i32 %tmp3)
+; CHECK: call void @f(i64 %[[LO_MERGE]], i32 8)
ret void
+; CHECK: ret void
}
define void @test4() {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A_SROA_0_0_EXTRACT_SHIFT:%.*]] = lshr i64 34494054408, 32
-; CHECK-NEXT: [[A_SROA_0_0_EXTRACT_TRUNC:%.*]] = trunc i64 [[A_SROA_0_0_EXTRACT_SHIFT]] to i32
-; CHECK-NEXT: [[A_SROA_3_0_EXTRACT_TRUNC:%.*]] = trunc i64 34494054408 to i32
-; CHECK-NEXT: [[A_SROA_3_0_INSERT_EXT:%.*]] = zext i32 [[A_SROA_3_0_EXTRACT_TRUNC]] to i64
-; CHECK-NEXT: [[A_SROA_3_0_INSERT_MASK:%.*]] = and i64 undef, -4294967296
-; CHECK-NEXT: [[A_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[A_SROA_3_0_INSERT_MASK]], [[A_SROA_3_0_INSERT_EXT]]
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_EXT:%.*]] = zext i32 [[A_SROA_0_0_EXTRACT_TRUNC]] to i64
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_SHIFT:%.*]] = shl i64 [[A_SROA_0_0_INSERT_EXT]], 32
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_MASK:%.*]] = and i64 [[A_SROA_3_0_INSERT_INSERT]], 4294967295
-; CHECK-NEXT: [[A_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[A_SROA_0_0_INSERT_MASK]], [[A_SROA_0_0_INSERT_SHIFT]]
-; CHECK-NEXT: call void @f(i64 [[A_SROA_0_0_INSERT_INSERT]], i32 [[A_SROA_0_0_EXTRACT_TRUNC]])
-; CHECK-NEXT: ret void
+; CHECK-LABEL: @test4
;
; Much like @test3, this is specifically testing big-endian management of data.
; Also similarly, it uses constants with particular bits set to help track
; whether values are corrupted, and can be easily evaluated by running through
; -instcombine to see that the i64 round-trips.
+;
entry:
%a = alloca { i32, i24 }, align 4
%a2 = alloca i64, align 4
+; CHECK-NOT: alloca
store i64 34494054408, i64* %a2
%tmp0 = bitcast { i32, i24 }* %a to i8*
%tmp1 = bitcast i64* %a2 to i8*
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %tmp0, i8* align 4 %tmp1, i64 8, i1 false)
+; CHECK: %[[LO_SHR:.*]] = lshr i64 34494054408, 32
+; CHECK: %[[LO_START:.*]] = trunc i64 %[[LO_SHR]] to i32
+; CHECK: %[[HI_START:.*]] = trunc i64 34494054408 to i32
%tmp2 = bitcast { i32, i24 }* %a to i64*
%tmp3 = load i64, i64* %tmp2, align 4
%tmp4 = bitcast { i32, i24 }* %a to i32*
%tmp5 = load i32, i32* %tmp4, align 4
+; CHECK: %[[HI_EXT:.*]] = zext i32 %[[HI_START]] to i64
+; CHECK: %[[HI_INPUT:.*]] = and i64 undef, -4294967296
+; CHECK: %[[HI_MERGE:.*]] = or i64 %[[HI_INPUT]], %[[HI_EXT]]
+; CHECK: %[[LO_EXT:.*]] = zext i32 %[[LO_START]] to i64
+; CHECK: %[[LO_SHL:.*]] = shl i64 %[[LO_EXT]], 32
+; CHECK: %[[LO_INPUT:.*]] = and i64 %[[HI_MERGE]], 4294967295
+; CHECK: %[[LO_MERGE:.*]] = or i64 %[[LO_INPUT]], %[[LO_SHL]]
call void @f(i64 %tmp3, i32 %tmp5)
+; CHECK: call void @f(i64 %[[LO_MERGE]], i32 %[[LO_START]])
ret void
+; CHECK: ret void
}
declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i1)
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; Test that SROA can deal with allocas that have more than one
; dbg.declare hanging off of it.
; Function Attrs: noinline optnone ssp uwtable
define i64 @_Z1g4pair(i64 %p.coerce0, i64 %p.coerce1) #0 !dbg !8 {
-; CHECK-LABEL: @_Z1g4pair(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE0:%.*]], metadata [[META16:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG17:![0-9]+]]
-; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE0]], metadata [[META18:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg [[DBG20:![0-9]+]]
-; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE1:%.*]], metadata [[META16]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG17]]
-; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 [[P_COERCE1]], metadata [[META18]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG20]]
-; CHECK-NEXT: ret i64 [[P_COERCE0]], !dbg [[DBG22:![0-9]+]]
-;
entry:
%p = alloca %struct.pair, align 8
%0 = getelementptr inbounds %struct.pair, %struct.pair* %p, i32 0, i32 0
store i64 %p.coerce0, i64* %0, align 8
%1 = getelementptr inbounds %struct.pair, %struct.pair* %p, i32 0, i32 1
store i64 %p.coerce1, i64* %1, align 8
+ ; CHECK-DAG: call void @llvm.dbg.value(metadata i64 %p.coerce0, metadata ![[VAR:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg ![[LOC:[0-9]+]]
+ ; CHECK-DAG: call void @llvm.dbg.value(metadata i64 %p.coerce1, metadata ![[VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg ![[LOC]]
+ ; CHECK-DAG: call void @llvm.dbg.value(metadata i64 %p.coerce0, metadata ![[INLINED_VAR:[0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 64)), !dbg ![[INLINED_LOC:[0-9]+]]
+ ; CHECK-DAG: call void @llvm.dbg.value(metadata i64 %p.coerce1, metadata ![[INLINED_VAR]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg ![[INLINED_LOC]]
call void @llvm.dbg.declare(metadata %struct.pair* %p, metadata !17, metadata !DIExpression()), !dbg !18
call void @llvm.dbg.declare(metadata %struct.pair* %p, metadata !21, metadata !DIExpression()), !dbg !23
%a.i = getelementptr inbounds %struct.pair, %struct.pair* %p, i32 0, i32 0, !dbg !25
!15 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !13, file: !9, line: 1, baseType: !12, size: 64)
!16 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !13, file: !9, line: 1, baseType: !12, size: 64, offset: 64)
!17 = !DILocalVariable(name: "p", arg: 1, scope: !8, file: !9, line: 9, type: !13)
+; CHECK: ![[LOC]] = !DILocation
; CHECK-NOT: inlinedAt
; CHECK: =
!18 = !DILocation(line: 9, column: 27, scope: !8)
!20 = !DILocation(line: 10, column: 10, scope: !8)
!21 = !DILocalVariable(name: "p", arg: 1, scope: !22, file: !9, line: 5, type: !13)
!22 = distinct !DISubprogram(name: "f", linkageName: "_ZL1f4pair", scope: !9, file: !9, line: 5, type: !10, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !0, retainedNodes: !2)
+; CHECK: ![[INLINED_LOC]] = !DILocation({{.*}}inlinedAt
!23 = !DILocation(line: 5, column: 27, scope: !22, inlinedAt: !24)
!24 = distinct !DILocation(line: 10, column: 10, scope: !8)
!25 = !DILocation(line: 6, column: 12, scope: !22, inlinedAt: !24)
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=sroa %s -S | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
define void @_ZL18findInsertLocationPN4llvm17MachineBasicBlockENS_9SlotIndexERNS_13LiveIntervalsE() {
-; CHECK-LABEL: @_ZL18findInsertLocationPN4llvm17MachineBasicBlockENS_9SlotIndexERNS_13LiveIntervalsE(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.dbg.value(metadata %foo* undef, metadata [[META3:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg [[DBG8:![0-9]+]]
-; CHECK-NEXT: ret void
-;
entry:
%retval = alloca %foo, align 8
call void @llvm.dbg.declare(metadata %foo* %retval, metadata !1, metadata !7), !dbg !8
; Checks that SROA still inserts a bit_piece expression, even if it produces only one piece
; (as long as that piece is smaller than the whole thing)
+; CHECK-NOT: call void @llvm.dbg.value
+; CHECK: call void @llvm.dbg.value(metadata %foo* undef, {{.*}}, metadata !DIExpression(DW_OP_LLVM_fragment, 64, 64)), !dbg
+; CHECK-NOT: call void @llvm.dbg.value
%0 = bitcast %foo* %retval to i8*
%1 = getelementptr inbounds i8, i8* %0, i64 8
%2 = bitcast i8* %1 to %foo**
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; SROA fails to rewrite allocs but does rewrite some phis and delete
; dead instructions. Ensure that this invalidates analyses required
; for other passes.
store i64 0, i64* %.sroa.0, align 8
%4 = extractvalue [2 x i64] %1, 1
switch i64 %4, label %6 [
- i64 4, label %foo
- i64 5, label %5
+ i64 4, label %foo
+ i64 5, label %5
]
; <label>:5:
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=sroa -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
define { i32, i32 } @test0(i32 %x, i32 %y) {
; CHECK-LABEL: @test0(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[RESULT_FCA_0_INSERT:%.*]] = insertvalue { i32, i32 } poison, i32 [[X:%.*]], 0
-; CHECK-NEXT: [[RESULT_FCA_1_INSERT:%.*]] = insertvalue { i32, i32 } [[RESULT_FCA_0_INSERT]], i32 [[Y:%.*]], 1
-; CHECK-NEXT: ret { i32, i32 } [[RESULT_FCA_1_INSERT]]
-;
+; CHECK-NOT: alloca
+; CHECK: insertvalue { i32, i32 }
+; CHECK: insertvalue { i32, i32 }
+; CHECK: ret { i32, i32 }
entry:
%a = alloca { i32, i32 }
; split the volatile load and store here but must produce volatile scalar loads
; and stores from them.
; CHECK-LABEL: @test1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A:%.*]] = alloca { i32, i32 }, align 8
-; CHECK-NEXT: [[B:%.*]] = alloca { i32, i32 }, align 8
-; CHECK-NEXT: [[A_0_GEP1_SROA_IDX:%.*]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[A]], i64 0, i32 0
-; CHECK-NEXT: store i32 [[X:%.*]], i32* [[A_0_GEP1_SROA_IDX]], align 8
-; CHECK-NEXT: [[A_4_GEP2_SROA_IDX:%.*]] = getelementptr inbounds { i32, i32 }, { i32, i32 }* [[A]], i64 0, i32 1
-; CHECK-NEXT: store i32 [[Y:%.*]], i32* [[A_4_GEP2_SROA_IDX]], align 4
-; CHECK-NEXT: [[A_0_RESULT:%.*]] = load volatile { i32, i32 }, { i32, i32 }* [[A]], align 8
-; CHECK-NEXT: store volatile { i32, i32 } [[A_0_RESULT]], { i32, i32 }* [[B]], align 8
-; CHECK-NEXT: ret { i32, i32 } [[A_0_RESULT]]
-;
+; CHECK: alloca
+; CHECK: alloca
+; CHECK: load volatile { i32, i32 }, { i32, i32 }*
+; CHECK: store volatile { i32, i32 }
+; CHECK: ret { i32, i32 }
entry:
%a = alloca { i32, i32 }
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=sroa -S | FileCheck %s
;
; Make sure that SROA doesn't lose nonnull metadata
; Check that we do basic propagation of nonnull when rewriting.
define i8* @propagate_nonnull(i32* %v) {
-; CHECK-LABEL: @propagate_nonnull(
+; CHECK-LABEL: define i8* @propagate_nonnull(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A_SROA_1:%.*]] = alloca i8*, align 8
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[V:%.*]] to i8*
-; CHECK-NEXT: store i8* [[TMP0]], i8** [[A_SROA_1]], align 8
-; CHECK-NEXT: [[A_SROA_1_0_A_SROA_1_8_LOAD:%.*]] = load volatile i8*, i8** [[A_SROA_1]], align 8, !nonnull !0
-; CHECK-NEXT: ret i8* [[A_SROA_1_0_A_SROA_1_8_LOAD]]
-;
+; CHECK-NEXT: %[[A:.*]] = alloca i8*
+; CHECK-NEXT: %[[V_CAST:.*]] = bitcast i32* %v to i8*
+; CHECK-NEXT: store i8* %[[V_CAST]], i8** %[[A]]
+; CHECK-NEXT: %[[LOAD:.*]] = load volatile i8*, i8** %[[A]], align 8, !nonnull !0
+; CHECK-NEXT: ret i8* %[[LOAD]]
entry:
%a = alloca [2 x i8*]
%a.gep0 = getelementptr [2 x i8*], [2 x i8*]* %a, i32 0, i32 0
}
define float* @turn_nonnull_into_assume(float** %arg) {
-; CHECK-LABEL: @turn_nonnull_into_assume(
+; CHECK-LABEL: define float* @turn_nonnull_into_assume(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[BUF_0_COPYLOAD:%.*]] = load float*, float** [[ARG:%.*]], align 8
-; CHECK-NEXT: [[TMP0:%.*]] = icmp ne float* [[BUF_0_COPYLOAD]], null
-; CHECK-NEXT: call void @llvm.assume(i1 [[TMP0]])
-; CHECK-NEXT: ret float* [[BUF_0_COPYLOAD]]
-;
+; CHECK-NEXT: %[[RETURN:.*]] = load float*, float** %arg, align 8
+; CHECK-NEXT: %[[ASSUME:.*]] = icmp ne float* %[[RETURN]], null
+; CHECK-NEXT: call void @llvm.assume(i1 %[[ASSUME]])
+; CHECK-NEXT: ret float* %[[RETURN]]
entry:
%buf = alloca float*
%_arg_i8 = bitcast float** %arg to i8*
; *does* initially, but then we lose that !range metadata before we finish
; SROA.
define i8* @propagate_nonnull_to_int() {
-; CHECK-LABEL: @propagate_nonnull_to_int(
+; CHECK-LABEL: define i8* @propagate_nonnull_to_int(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[A_SROA_1:%.*]] = alloca i8*, align 8
-; CHECK-NEXT: store i8* inttoptr (i64 42 to i8*), i8** [[A_SROA_1]], align 8
-; CHECK-NEXT: [[A_SROA_1_0_A_SROA_1_8_LOAD:%.*]] = load volatile i8*, i8** [[A_SROA_1]], align 8, !nonnull !0
-; CHECK-NEXT: ret i8* [[A_SROA_1_0_A_SROA_1_8_LOAD]]
-;
+; CHECK-NEXT: %[[A:.*]] = alloca i8*
+; CHECK-NEXT: store i8* inttoptr (i64 42 to i8*), i8** %[[A]]
+; CHECK-NEXT: %[[LOAD:.*]] = load volatile i8*, i8** %[[A]]
+; CHECK-NEXT: ret i8* %[[LOAD]]
entry:
%a = alloca [2 x i8*]
%a.gep0 = getelementptr [2 x i8*], [2 x i8*]* %a, i32 0, i32 0
; register. This can fail in interesting ways due to the rewrite iteration of
; SROA, resulting in PR32902.
define i8* @propagate_nonnull_to_int_and_promote() {
-; CHECK-LABEL: @propagate_nonnull_to_int_and_promote(
+; CHECK-LABEL: define i8* @propagate_nonnull_to_int_and_promote(
; CHECK-NEXT: entry:
; CHECK-NEXT: ret i8* inttoptr (i64 42 to i8*)
-;
entry:
%a = alloca [2 x i8*], align 8
%a.gep0 = getelementptr [2 x i8*], [2 x i8*]* %a, i32 0, i32 0
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=sroa -S | FileCheck %s
target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
; slices even in case of types that are skipped because their width is not a
; byte width multiple
define void @skipped_inttype_first({ i16*, i32 }*) {
-; CHECK-LABEL: @skipped_inttype_first(
-; CHECK-NEXT: [[ARG_SROA_0:%.*]] = alloca i8*, align 8
-; CHECK-NEXT: [[ARG_SROA_0_0__SROA_CAST:%.*]] = bitcast { i16*, i32 }* [[TMP0:%.*]] to i8**
-; CHECK-NEXT: [[ARG_SROA_0_0_COPYLOAD:%.*]] = load i8*, i8** [[ARG_SROA_0_0__SROA_CAST]], align 8
-; CHECK-NEXT: store i8* [[ARG_SROA_0_0_COPYLOAD]], i8** [[ARG_SROA_0]], align 8
-; CHECK-NEXT: [[ARG_SROA_3_0__SROA_IDX1:%.*]] = getelementptr inbounds { i16*, i32 }, { i16*, i32 }* [[TMP0]], i64 0, i32 1
-; CHECK-NEXT: [[ARG_SROA_3_0__SROA_CAST:%.*]] = bitcast i32* [[ARG_SROA_3_0__SROA_IDX1]] to i64*
-; CHECK-NEXT: [[ARG_SROA_3_0_COPYLOAD:%.*]] = load i64, i64* [[ARG_SROA_3_0__SROA_CAST]], align 8
-; CHECK-NEXT: [[ARG_SROA_0_0_PB0_SROA_CAST2:%.*]] = bitcast i8** [[ARG_SROA_0]] to i63*
-; CHECK-NEXT: [[ARG_SROA_0_0_ARG_SROA_0_0_B0:%.*]] = load i63, i63* [[ARG_SROA_0_0_PB0_SROA_CAST2]], align 8
-; CHECK-NEXT: [[ARG_SROA_0_0_ARG_SROA_0_0_B1:%.*]] = load i8*, i8** [[ARG_SROA_0]], align 8
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @skipped_inttype_first
+; CHECK: alloca i8*
%arg = alloca { i16*, i32 }, align 8
%2 = bitcast { i16*, i32 }* %0 to i8*
%3 = bitcast { i16*, i32 }* %arg to i8*
}
define void @skipped_inttype_last({ i16*, i32 }*) {
-; CHECK-LABEL: @skipped_inttype_last(
-; CHECK-NEXT: [[ARG_SROA_0:%.*]] = alloca i8*, align 8
-; CHECK-NEXT: [[ARG_SROA_0_0__SROA_CAST:%.*]] = bitcast { i16*, i32 }* [[TMP0:%.*]] to i8**
-; CHECK-NEXT: [[ARG_SROA_0_0_COPYLOAD:%.*]] = load i8*, i8** [[ARG_SROA_0_0__SROA_CAST]], align 8
-; CHECK-NEXT: store i8* [[ARG_SROA_0_0_COPYLOAD]], i8** [[ARG_SROA_0]], align 8
-; CHECK-NEXT: [[ARG_SROA_3_0__SROA_IDX1:%.*]] = getelementptr inbounds { i16*, i32 }, { i16*, i32 }* [[TMP0]], i64 0, i32 1
-; CHECK-NEXT: [[ARG_SROA_3_0__SROA_CAST:%.*]] = bitcast i32* [[ARG_SROA_3_0__SROA_IDX1]] to i64*
-; CHECK-NEXT: [[ARG_SROA_3_0_COPYLOAD:%.*]] = load i64, i64* [[ARG_SROA_3_0__SROA_CAST]], align 8
-; CHECK-NEXT: [[ARG_SROA_0_0_ARG_SROA_0_0_B1:%.*]] = load i8*, i8** [[ARG_SROA_0]], align 8
-; CHECK-NEXT: [[ARG_SROA_0_0_PB0_SROA_CAST2:%.*]] = bitcast i8** [[ARG_SROA_0]] to i63*
-; CHECK-NEXT: [[ARG_SROA_0_0_ARG_SROA_0_0_B0:%.*]] = load i63, i63* [[ARG_SROA_0_0_PB0_SROA_CAST2]], align 8
-; CHECK-NEXT: ret void
-;
+; CHECK-LABEL: @skipped_inttype_last
+; CHECK: alloca i8*
%arg = alloca { i16*, i32 }, align 8
%2 = bitcast { i16*, i32 }* %0 to i8*
%3 = bitcast { i16*, i32 }* %arg to i8*
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=sroa -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
define <4 x i64> @vector_ptrtoint({<2 x i32*>, <2 x i32*>} %x) {
-; CHECK-LABEL: @vector_ptrtoint(
-; CHECK-NEXT: [[X_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i32*>, <2 x i32*> } [[X:%.*]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <2 x i32*> [[X_FCA_0_EXTRACT]] to <2 x i64>
-; CHECK-NEXT: [[A_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: [[A_SROA_0_0_VECBLEND:%.*]] = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i64> [[A_SROA_0_0_VEC_EXPAND]], <4 x i64> undef
-; CHECK-NEXT: [[X_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i32*>, <2 x i32*> } [[X]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <2 x i32*> [[X_FCA_1_EXTRACT]] to <2 x i64>
-; CHECK-NEXT: [[A_SROA_0_16_VEC_EXPAND:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
-; CHECK-NEXT: [[A_SROA_0_16_VECBLEND:%.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i64> [[A_SROA_0_16_VEC_EXPAND]], <4 x i64> [[A_SROA_0_0_VECBLEND]]
-; CHECK-NEXT: ret <4 x i64> [[A_SROA_0_16_VECBLEND]]
-;
+; CHECK-LABEL: @vector_ptrtoint
%a = alloca {<2 x i32*>, <2 x i32*>}
+; CHECK-NOT: alloca
store {<2 x i32*>, <2 x i32*>} %x, {<2 x i32*>, <2 x i32*>}* %a
+; CHECK-NOT: store
%cast = bitcast {<2 x i32*>, <2 x i32*>}* %a to <4 x i64>*
%vec = load <4 x i64>, <4 x i64>* %cast
+; CHECK-NOT: load
+; CHECK: ptrtoint
ret <4 x i64> %vec
}
define <4 x i32*> @vector_inttoptr({<2 x i64>, <2 x i64>} %x) {
-; CHECK-LABEL: @vector_inttoptr(
-; CHECK-NEXT: [[X_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[X:%.*]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = inttoptr <2 x i64> [[X_FCA_0_EXTRACT]] to <2 x i32*>
-; CHECK-NEXT: [[A_SROA_0_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32*> [[TMP1]], <2 x i32*> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; CHECK-NEXT: [[A_SROA_0_0_VECBLEND:%.*]] = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i32*> [[A_SROA_0_0_VEC_EXPAND]], <4 x i32*> undef
-; CHECK-NEXT: [[X_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[X]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <2 x i64> [[X_FCA_1_EXTRACT]] to <2 x i32*>
-; CHECK-NEXT: [[A_SROA_0_16_VEC_EXPAND:%.*]] = shufflevector <2 x i32*> [[TMP2]], <2 x i32*> poison, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1>
-; CHECK-NEXT: [[A_SROA_0_16_VECBLEND:%.*]] = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x i32*> [[A_SROA_0_16_VEC_EXPAND]], <4 x i32*> [[A_SROA_0_0_VECBLEND]]
-; CHECK-NEXT: ret <4 x i32*> [[A_SROA_0_16_VECBLEND]]
-;
+; CHECK-LABEL: @vector_inttoptr
%a = alloca {<2 x i64>, <2 x i64>}
+; CHECK-NOT: alloca
store {<2 x i64>, <2 x i64>} %x, {<2 x i64>, <2 x i64>}* %a
+; CHECK-NOT: store
%cast = bitcast {<2 x i64>, <2 x i64>}* %a to <4 x i32*>*
%vec = load <4 x i32*>, <4 x i32*>* %cast
+; CHECK-NOT: load
+; CHECK: inttoptr
ret <4 x i32*> %vec
}
define <2 x i64> @vector_ptrtointbitcast({<1 x i32*>, <1 x i32*>} %x) {
; CHECK-LABEL: @vector_ptrtointbitcast(
-; CHECK-NEXT: [[X_FCA_0_EXTRACT:%.*]] = extractvalue { <1 x i32*>, <1 x i32*> } [[X:%.*]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <1 x i32*> [[X_FCA_0_EXTRACT]] to <1 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i64> [[TMP1]] to i64
-; CHECK-NEXT: [[A_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0
-; CHECK-NEXT: [[X_FCA_1_EXTRACT:%.*]] = extractvalue { <1 x i32*>, <1 x i32*> } [[X]], 1
-; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint <1 x i32*> [[X_FCA_1_EXTRACT]] to <1 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
-; CHECK-NEXT: [[A_SROA_0_8_VEC_INSERT:%.*]] = insertelement <2 x i64> [[A_SROA_0_0_VEC_INSERT]], i64 [[TMP4]], i32 1
-; CHECK-NEXT: ret <2 x i64> [[A_SROA_0_8_VEC_INSERT]]
-;
%a = alloca {<1 x i32*>, <1 x i32*>}
+; CHECK-NOT: alloca
store {<1 x i32*>, <1 x i32*>} %x, {<1 x i32*>, <1 x i32*>}* %a
+; CHECK-NOT: store
%cast = bitcast {<1 x i32*>, <1 x i32*>}* %a to <2 x i64>*
%vec = load <2 x i64>, <2 x i64>* %cast
+; CHECK-NOT: load
+; CHECK: ptrtoint
+; CHECK: bitcast
+; CHECK: ptrtoint
+; CHECK: bitcast
ret <2 x i64> %vec
}
define <2 x i8*> @vector_inttoptrbitcast_vector({<16 x i8>, <16 x i8>} %x) {
; CHECK-LABEL: @vector_inttoptrbitcast_vector(
-; CHECK-NEXT: [[X_FCA_0_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[X:%.*]], 0
-; CHECK-NEXT: [[X_FCA_1_EXTRACT:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[X]], 1
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[X_FCA_0_EXTRACT]] to <2 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <2 x i64> [[TMP1]] to <2 x i8*>
-; CHECK-NEXT: ret <2 x i8*> [[TMP2]]
-;
%a = alloca {<16 x i8>, <16 x i8>}
+; CHECK-NOT: alloca
store {<16 x i8>, <16 x i8>} %x, {<16 x i8>, <16 x i8>}* %a
+; CHECK-NOT: store
%cast = bitcast {<16 x i8>, <16 x i8>}* %a to <2 x i8*>*
%vec = load <2 x i8*>, <2 x i8*>* %cast
+; CHECK-NOT: load
+; CHECK: extractvalue
+; CHECK: extractvalue
+; CHECK: bitcast
+; CHECK: inttoptr
ret <2 x i8*> %vec
}
define <16 x i8> @vector_ptrtointbitcast_vector({<2 x i8*>, <2 x i8*>} %x) {
; CHECK-LABEL: @vector_ptrtointbitcast_vector(
-; CHECK-NEXT: [[X_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x i8*>, <2 x i8*> } [[X:%.*]], 0
-; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <2 x i8*> [[X_FCA_0_EXTRACT]] to <2 x i64>
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[TMP1]] to <16 x i8>
-; CHECK-NEXT: [[X_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x i8*>, <2 x i8*> } [[X]], 1
-; CHECK-NEXT: ret <16 x i8> [[TMP2]]
-;
%a = alloca {<2 x i8*>, <2 x i8*>}
+; CHECK-NOT: alloca
store {<2 x i8*>, <2 x i8*>} %x, {<2 x i8*>, <2 x i8*>}* %a
+; CHECK-NOT: store
%cast = bitcast {<2 x i8*>, <2 x i8*>}* %a to <16 x i8>*
%vec = load <16 x i8>, <16 x i8>* %cast
+; CHECK-NOT: load
+; CHECK: extractvalue
+; CHECK: ptrtoint
+; CHECK: bitcast
+; CHECK: extractvalue
ret <16 x i8> %vec
}
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=sroa -S | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
define <4 x i1> @vector_bitcast() {
-; CHECK-LABEL: @vector_bitcast(
-; CHECK-NEXT: [[A:%.*]] = alloca <3 x i1>, align 1
-; CHECK-NEXT: store <3 x i1> <i1 true, i1 false, i1 true>, <3 x i1>* [[A]], align 1
-; CHECK-NEXT: [[A_0_CAST_SROA_CAST:%.*]] = bitcast <3 x i1>* [[A]] to <4 x i1>*
-; CHECK-NEXT: [[A_0_VEC:%.*]] = load <4 x i1>, <4 x i1>* [[A_0_CAST_SROA_CAST]], align 1
-; CHECK-NEXT: ret <4 x i1> [[A_0_VEC]]
-;
+ ; CHECK-LABEL: @vector_bitcast
+ ; CHECK: alloca <3 x i1>
- %a = alloca <3 x i1>
- store <3 x i1> <i1 1,i1 0,i1 1>, <3 x i1>* %a
- %cast = bitcast <3 x i1>* %a to <4 x i1>*
- %vec = load <4 x i1>, <4 x i1>* %cast
- ret <4 x i1> %vec
+ %a = alloca <3 x i1>
+ store <3 x i1> <i1 1,i1 0,i1 1>, <3 x i1>* %a
+ %cast = bitcast <3 x i1>* %a to <4 x i1>*
+ %vec = load <4 x i1>, <4 x i1>* %cast
+ ret <4 x i1> %vec
}
define void @vector_bitcast_2() {
-; CHECK-LABEL: @vector_bitcast_2(
-; CHECK-NEXT: %"sum$1.host2" = alloca <32 x i16>, align 64
-; CHECK-NEXT: store <32 x i16> undef, <32 x i16>* %"sum$1.host2", align 64
-; CHECK-NEXT: %"sum$1.host2.0.bc.sroa_cast" = bitcast <32 x i16>* %"sum$1.host2" to <64 x i16>*
-; CHECK-NEXT: %"sum$1.host2.0.bcl" = load <64 x i16>, <64 x i16>* %"sum$1.host2.0.bc.sroa_cast", align 64
-; CHECK-NEXT: ret void
-;
+ ; CHECK-LABEL: @vector_bitcast_2
+ ; CHECK: alloca <32 x i16>
- %"sum$1.host2" = alloca <32 x i16>
- store <32 x i16> undef, <32 x i16>* %"sum$1.host2"
- %bc = bitcast <32 x i16>* %"sum$1.host2" to <64 x i16>*
- %bcl = load <64 x i16>, <64 x i16>* %bc
- ret void
+ %"sum$1.host2" = alloca <32 x i16>
+ store <32 x i16> undef, <32 x i16>* %"sum$1.host2"
+ %bc = bitcast <32 x i16>* %"sum$1.host2" to <64 x i16>*
+ %bcl = load <64 x i16>, <64 x i16>* %bc
+ ret void
}