From 42bcec7d38a5fd707f446719a9cf859f9fdbc1ef Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Tue, 20 Feb 2018 19:49:25 +0000
Subject: [PATCH] [LV] Fix test checks, NFC.

llvm-svn: 325617
---
 .../Transforms/LoopVectorize/X86/gather_scatter.ll |  768 +++++-
 .../LoopVectorize/X86/masked_load_store.ll         | 2878 +++++++++++++++++++-
 2 files changed, 3506 insertions(+), 140 deletions(-)

diff --git a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
index 8ef5961..9217c2e 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@@ -1,7 +1,6 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
-;AVX1-NOT: llvm.masked
-
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-pc_linux"
@@ -16,14 +15,78 @@ target triple = "x86_64-pc_linux"
 ; }
 ;}
-;AVX512-LABEL: @foo1
-;AVX512: llvm.masked.load.v16i32.p0v16i32
-;AVX512: llvm.masked.gather.v16f32.v16p0f32
-;AVX512: llvm.masked.store.v16f32.p0v16f32
-;AVX512: ret void
-
 ; Function Attrs: nounwind uwtable
 define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+; AVX512-LABEL: @foo1(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX6:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDEX6]]
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4
+; AVX512-NEXT: [[TMP2:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD]], zeroinitializer
+; AVX512-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[INDEX:%.*]], i64 [[INDEX6]]
+; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP4]], i32 4, <16 x i1> [[TMP2]], <16 x i32> undef)
+; AVX512-NEXT: [[TMP5:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD]] to <16 x i64>
+; AVX512-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, float* [[IN:%.*]], <16 x i64> [[TMP5]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP6]], i32 4, <16 x i1> [[TMP2]], <16 x float> undef)
+; AVX512-NEXT: [[TMP7:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER]],
+; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], i64 [[INDEX6]]
+; AVX512-NEXT: [[TMP9:%.*]] = bitcast float* [[TMP8]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP7]], <16 x float>* [[TMP9]], i32 4, <16 x i1> [[TMP2]])
+; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX6]], 16
+; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP11]], align 4
+; AVX512-NEXT: [[TMP12:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_1]], zeroinitializer
+; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP13]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP14]], i32 4, <16 x i1> [[TMP12]], <16 x i32> undef)
+; AVX512-NEXT: [[TMP15:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_1]] to <16 x i64>
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP15]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP16]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef)
+; AVX512-NEXT: [[TMP17:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_1]],
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT]]
+; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP17]], <16 x float>* [[TMP19]], i32 4, <16 x i1> [[TMP12]])
+; AVX512-NEXT: [[INDEX_NEXT_1:%.*]] = or i64 [[INDEX6]], 32
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_1]]
+; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD_2:%.*]] = load <16 x i32>, <16 x i32>* [[TMP21]], align 4
+; AVX512-NEXT: [[TMP22:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_2]], zeroinitializer
+; AVX512-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_1]]
+; AVX512-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_2:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP24]], i32 4, <16 x i1> [[TMP22]], <16 x i32> undef)
+; AVX512-NEXT: [[TMP25:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_2]] to <16 x i64>
+; AVX512-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP25]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP26]], i32 4, <16 x i1> [[TMP22]], <16 x float> undef)
+; AVX512-NEXT: [[TMP27:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_2]],
+; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_1]]
+; AVX512-NEXT: [[TMP29:%.*]] = bitcast float* [[TMP28]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP29]], i32 4, <16 x i1> [[TMP22]])
+; AVX512-NEXT: [[INDEX_NEXT_2:%.*]] = or i64 [[INDEX6]], 48
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT_2]]
+; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD_3:%.*]] = load <16 x i32>, <16 x i32>* [[TMP31]], align 4
+; AVX512-NEXT: [[TMP32:%.*]] = icmp sgt <16 x i32> [[WIDE_LOAD_3]], zeroinitializer
+; AVX512-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[INDEX]], i64 [[INDEX_NEXT_2]]
+; AVX512-NEXT: [[TMP34:%.*]] = bitcast i32* [[TMP33]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD_3:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP34]], i32 4, <16 x i1> [[TMP32]], <16 x i32> undef)
+; AVX512-NEXT: [[TMP35:%.*]] = sext <16 x i32> [[WIDE_MASKED_LOAD_3]] to <16 x i64>
+; AVX512-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, float* [[IN]], <16 x i64> [[TMP35]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP36]], i32 4, <16 x i1> [[TMP32]], <16 x float> undef)
+; AVX512-NEXT: [[TMP37:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER_3]],
+; AVX512-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, float* [[OUT]], i64 [[INDEX_NEXT_2]]
+; AVX512-NEXT: [[TMP39:%.*]] = bitcast float* [[TMP38]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP37]], <16 x float>* [[TMP39]], i32 4, <16 x i1> [[TMP32]])
+; AVX512-NEXT: [[INDEX_NEXT_3]] = add nuw nsw i64 [[INDEX6]], 64
+; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT_3]], 4096
+; AVX512-NEXT: br i1 [[TMP40]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+;
 entry:
 %in.addr = alloca float*, align 8
 %out.addr = alloca float*, align 8
@@ -94,12 +157,139 @@ for.end: ; preds = %for.cond
 %struct.In = type { float, float }
-;AVX512-LABEL: @foo2
-;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.gather.v16f32.v16p0f32
-;AVX512: llvm.masked.scatter.v16f32.v16p0f32
-;AVX512: ret void
 define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
+; AVX512-LABEL: @foo2(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]],
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]],
+; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
+; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
+; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]],
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
+; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
+; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
+; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]],
+; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
+; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]],
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
+; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]],
+; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
+; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]],
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
+; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
+; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
+; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]],
+; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
+; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
+; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
+; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]],
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
+; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
+; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
+; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]],
+; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
+; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
+; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
+; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]],
+; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
+; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
+; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
+; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]],
+; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
+; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
+; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
+; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]],
+; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
+; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
+; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
+; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]],
+; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
+; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
+; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
+; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]],
+; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
+; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
+; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
+; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]],
+; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT: ret void
+;
 entry:
 %in.addr = alloca %struct.In*, align 8
 %out.addr = alloca float*, align 8
@@ -169,17 +359,141 @@ for.end: ; preds = %for.cond
 ; }
 ;}
-;AVX512-LABEL: @foo3
-;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.gather.v16f32.v16p0f32
-;AVX512: fadd <16 x float>
-;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.scatter.v16f32.v16p0f32
-;AVX512: ret void
-
 %struct.Out = type { float, float }
 define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) {
+; AVX512-LABEL: @foo3(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6]],
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_OUT:%.*]], %struct.Out* [[OUT:%.*]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_1]],
+; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
+; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
+; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_2]],
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
+; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
+; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
+; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_3]],
+; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
+; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_4]],
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
+; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_5]],
+; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
+; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_6]],
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]], <16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
+; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
+; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
+; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_7]],
+; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
+; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
+; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
+; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_8]],
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
+; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
+; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
+; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_9]],
+; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
+; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
+; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
+; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_10]],
+; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
+; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
+; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
+; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_11]],
+; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
+; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
+; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
+; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_12]],
+; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
+; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
+; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
+; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_13]],
+; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
+; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
+; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
+; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_14]],
+; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
+; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
+; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER6_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
+; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER6_15]],
+; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds [[STRUCT_OUT]], %struct.Out* [[OUT]], <16 x i64> , i32 1
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT: ret void
+;
 entry:
 %in.addr = alloca %struct.In*, align 8
 %out.addr = alloca %struct.Out*, align 8
@@ -237,12 +551,139 @@ declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i
 ; The same as @foo2 but scatter/gather argument is a vecotr of ptrs with addresspace 1
-;AVX512-LABEL: @foo2_addrspace
-;AVX512: getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.gather.v16f32.v16p1f32
-;AVX512: llvm.masked.scatter.v16f32.v16p1f32
-;AVX512: ret void
 define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
+; AVX512-LABEL: @foo2_addrspace(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]],
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]],
+; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
+; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
+; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]],
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
+; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
+; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
+; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]],
+; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
+; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]],
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
+; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]],
+; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
+; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]],
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], i32 4, <16 x i1> [[TMP31]])
+; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer
+; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef)
+; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]],
+; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]])
+; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer
+; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef)
+; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]],
+; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]])
+; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer
+; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef)
+; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]],
+; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]])
+; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer
+; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef)
+; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]],
+; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]])
+; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer
+; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef)
+; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]],
+; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]])
+; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer
+; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef)
+; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]],
+; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]])
+; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer
+; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef)
+; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]],
+; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]])
+; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer
+; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef)
+; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]],
+; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]])
+; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer
+; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef)
+; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]],
+; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]])
+; AVX512-NEXT: ret void
+;
 entry:
 %in.addr = alloca %struct.In addrspace(1)*, align 8
 %out.addr = alloca float addrspace(1)*, align 8
@@ -300,12 +741,139 @@ for.end: ; preds = %for.cond
 ; Same as foo2_addrspace but here only the input has the non-default address space.
-;AVX512-LABEL: @foo2_addrspace2
-;AVX512: getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.gather.v16f32.v16p1f32
-;AVX512: llvm.masked.scatter.v16f32.v16p0f32
-;AVX512: ret void
 define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+; AVX512-LABEL: @foo2_addrspace2(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], [[STRUCT_IN]] addrspace(1)* [[IN:%.*]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef)
+; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]],
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float* [[OUT:%.*]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP3]], <16 x float*> [[TMP4]], i32 4, <16 x i1> [[TMP1]])
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer
+; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef)
+; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]],
+; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP8]], <16 x float*> [[TMP9]], i32 4, <16 x i1> [[TMP6]])
+; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef)
+; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]],
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP13]], <16 x float*> [[TMP14]], i32 4, <16 x i1> [[TMP11]])
+; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer
+; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef)
+; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]],
+; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP18]], <16 x float*> [[TMP19]], i32 4, <16 x i1> [[TMP16]])
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef)
+; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]],
+; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP23]], <16 x float*> [[TMP24]], i32 4, <16 x i1> [[TMP21]])
+; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer
+; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef)
+; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]],
+; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP28]], <16 x float*> [[TMP29]], i32 4, <16 x i1> [[TMP26]])
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64>
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef)
+; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1
+; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef)
+; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]],
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64>
+; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP33]],
<16 x float*> [[TMP34]], i32 4, <16 x i1> [[TMP31]]) +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP38]], <16 x float*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) +; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP43]], <16 x float*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) +; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP48]], <16 x float*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP52]], i32 4, 
<16 x i1> [[TMP51]], <16 x float> undef) +; AVX512-NEXT: [[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], +; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP53]], <16 x float*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) +; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) +; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], +; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP58]], <16 x float*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) +; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], +; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP63]], <16 x float*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) +; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer +; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) +; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], +; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP68]], <16 x float*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) +; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> 
[[WIDE_MASKED_GATHER_14]], zeroinitializer +; AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) +; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], +; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP73]], <16 x float*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) +; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer +; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], [[STRUCT_IN]] addrspace(1)* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p1f32(<16 x float addrspace(1)*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) +; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], +; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP78]], <16 x float*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: ret void +; entry: %in.addr = alloca %struct.In addrspace(1)*, align 8 %out.addr = alloca float addrspace(0)*, align 8 @@ -363,13 +931,139 @@ for.end: ; preds = %for.cond ; Same as foo2_addrspace but here only the output has the non-default address space. 
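; For reference, the analogous pseudocode sketch for this variant (inferred from
; the checks below; the __global qualifier marking address space 1 on the output
; is illustrative, not from the original source):
;void foo2_addrspace3 (In * __restrict__ in, float __global * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
;
;  for (int i=0; i<SIZE; i += 16) {
;    if (trigger[i] > 0) {
;      out[i] = in[i].b + (float) 0.5;
;    }
;  }
;}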
-;AVX512-LABEL: @foo2_addrspace3 -;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1 -;AVX512: llvm.masked.gather.v16f32.v16p0f32 -;AVX512: llvm.masked.scatter.v16f32.v16p1f32 -;AVX512: ret void - define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) { +; AVX512-LABEL: @foo2_addrspace3( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP0]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP1:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER]], zeroinitializer +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_IN:%.*]], %struct.In* [[IN:%.*]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP2]], i32 4, <16 x i1> [[TMP1]], <16 x float> undef) +; AVX512-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7]], +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT:%.*]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP3]], <16 x float addrspace(1)*> [[TMP4]], i32 4, <16 x i1> [[TMP1]]) +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP5]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP6:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_1]], zeroinitializer +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_1:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP7]], i32 4, <16 x i1> [[TMP6]], <16 x float> undef) +; AVX512-NEXT: [[TMP8:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_1]], +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP8]], <16 x float addrspace(1)*> [[TMP9]], i32 4, <16 x i1> [[TMP6]]) +; AVX512-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP10]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP11:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_2]], zeroinitializer +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_2:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP12]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef) +; AVX512-NEXT: [[TMP13:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_2]], +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP13]], <16 x float addrspace(1)*> [[TMP14]], i32 4, <16 x i1> [[TMP11]]) +; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_3:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP15]], i32 4, <16 x i1> , <16 x i32> undef) +; 
AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_3]], zeroinitializer +; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_3:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP17]], i32 4, <16 x i1> [[TMP16]], <16 x float> undef) +; AVX512-NEXT: [[TMP18:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_3]], +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP18]], <16 x float addrspace(1)*> [[TMP19]], i32 4, <16 x i1> [[TMP16]]) +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_4:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP20]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP21:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_4]], zeroinitializer +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_4:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP22]], i32 4, <16 x i1> [[TMP21]], <16 x float> undef) +; AVX512-NEXT: [[TMP23:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_4]], +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP23]], <16 x float addrspace(1)*> [[TMP24]], i32 4, <16 x i1> [[TMP21]]) +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP25]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP26:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_5]], zeroinitializer +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_5:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP27]], i32 4, <16 x i1> [[TMP26]], <16 x float> undef) +; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_5]], +; AVX512-NEXT: [[TMP29:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP28]], <16 x float addrspace(1)*> [[TMP29]], i32 4, <16 x i1> [[TMP26]]) +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_6:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP30]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP31:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_6]], zeroinitializer +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_6:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP32]], i32 4, <16 x i1> [[TMP31]], <16 x float> undef) +; AVX512-NEXT: [[TMP33:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_6]], +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP33]], <16 x float addrspace(1)*> [[TMP34]], 
i32 4, <16 x i1> [[TMP31]]) +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_7:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP35]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP36:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_7]], zeroinitializer +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_7:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP37]], i32 4, <16 x i1> [[TMP36]], <16 x float> undef) +; AVX512-NEXT: [[TMP38:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_7]], +; AVX512-NEXT: [[TMP39:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP38]], <16 x float addrspace(1)*> [[TMP39]], i32 4, <16 x i1> [[TMP36]]) +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_8:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP40]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP41:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_8]], zeroinitializer +; AVX512-NEXT: [[TMP42:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_8:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP42]], i32 4, <16 x i1> [[TMP41]], <16 x float> undef) +; AVX512-NEXT: [[TMP43:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_8]], +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP43]], <16 x float addrspace(1)*> [[TMP44]], i32 4, <16 x i1> [[TMP41]]) +; AVX512-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_9:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP45]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP46:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_9]], zeroinitializer +; AVX512-NEXT: [[TMP47:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_9:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP47]], i32 4, <16 x i1> [[TMP46]], <16 x float> undef) +; AVX512-NEXT: [[TMP48:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_9]], +; AVX512-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP48]], <16 x float addrspace(1)*> [[TMP49]], i32 4, <16 x i1> [[TMP46]]) +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_10:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP50]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP51:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_10]], zeroinitializer +; AVX512-NEXT: [[TMP52:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_10:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP52]], i32 4, <16 x i1> [[TMP51]], <16 x float> undef) +; AVX512-NEXT: 
[[TMP53:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_10]], +; AVX512-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP53]], <16 x float addrspace(1)*> [[TMP54]], i32 4, <16 x i1> [[TMP51]]) +; AVX512-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_11:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP55]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP56:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_11]], zeroinitializer +; AVX512-NEXT: [[TMP57:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_11:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP57]], i32 4, <16 x i1> [[TMP56]], <16 x float> undef) +; AVX512-NEXT: [[TMP58:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_11]], +; AVX512-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP58]], <16 x float addrspace(1)*> [[TMP59]], i32 4, <16 x i1> [[TMP56]]) +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_12:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP60]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP61:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_12]], zeroinitializer +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_12:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP62]], i32 4, <16 x i1> [[TMP61]], <16 x float> undef) +; AVX512-NEXT: [[TMP63:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_12]], +; AVX512-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP63]], <16 x float addrspace(1)*> [[TMP64]], i32 4, <16 x i1> [[TMP61]]) +; AVX512-NEXT: [[TMP65:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_13:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP65]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP66:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_13]], zeroinitializer +; AVX512-NEXT: [[TMP67:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_13:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP67]], i32 4, <16 x i1> [[TMP66]], <16 x float> undef) +; AVX512-NEXT: [[TMP68:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_13]], +; AVX512-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP68]], <16 x float addrspace(1)*> [[TMP69]], i32 4, <16 x i1> [[TMP66]]) +; AVX512-NEXT: [[TMP70:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_14:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP70]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP71:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_14]], zeroinitializer +; 
AVX512-NEXT: [[TMP72:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_14:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP72]], i32 4, <16 x i1> [[TMP71]], <16 x float> undef) +; AVX512-NEXT: [[TMP73:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_14]], +; AVX512-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP73]], <16 x float addrspace(1)*> [[TMP74]], i32 4, <16 x i1> [[TMP71]]) +; AVX512-NEXT: [[TMP75:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <16 x i64> +; AVX512-NEXT: [[WIDE_MASKED_GATHER_15:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> [[TMP75]], i32 4, <16 x i1> , <16 x i32> undef) +; AVX512-NEXT: [[TMP76:%.*]] = icmp sgt <16 x i32> [[WIDE_MASKED_GATHER_15]], zeroinitializer +; AVX512-NEXT: [[TMP77:%.*]] = getelementptr inbounds [[STRUCT_IN]], %struct.In* [[IN]], <16 x i64> , i32 1 +; AVX512-NEXT: [[WIDE_MASKED_GATHER7_15:%.*]] = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> [[TMP77]], i32 4, <16 x i1> [[TMP76]], <16 x float> undef) +; AVX512-NEXT: [[TMP78:%.*]] = fadd <16 x float> [[WIDE_MASKED_GATHER7_15]], +; AVX512-NEXT: [[TMP79:%.*]] = getelementptr inbounds float, float addrspace(1)* [[OUT]], <16 x i64> +; AVX512-NEXT: call void @llvm.masked.scatter.v16f32.v16p1f32(<16 x float> [[TMP78]], <16 x float addrspace(1)*> [[TMP79]], i32 4, <16 x i1> [[TMP76]]) +; AVX512-NEXT: ret void +; entry: %in.addr = alloca %struct.In addrspace(0)*, align 8 %out.addr = alloca float addrspace(1)*, align 8 diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index 0377ae1..eb8c162 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -O3 -mcpu=corei7-avx -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX1 ; RUN: opt < %s -O3 -mcpu=core-avx2 -S | FileCheck %s -check-prefix=AVX -check-prefix=AVX2 ; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512 @@ -16,22 +17,420 @@ target triple = "x86_64-pc_linux" ; } ;} -;AVX-LABEL: @foo1 -;AVX: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8i32.p0v8i32 -;AVX: add nsw <8 x i32> -;AVX: call void @llvm.masked.store.v8i32.p0v8i32 -;AVX: ret void - -;AVX512-LABEL: @foo1 -;AVX512: icmp slt <16 x i32> %wide.load, @llvm.masked.load.v16i32.p0v16i32 -;AVX512: add nsw <16 x i32> -;AVX512: call void @llvm.masked.store.v16i32.p0v16i32 -;AVX512: ret void - ; Function Attrs: nounwind uwtable define void @foo1(i32* %A, i32* %B, i32* %trigger) { +; AVX1-LABEL: @foo1( +; AVX1-NEXT: entry: +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000 +; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000 +; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000 +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]] +; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]] +; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt i32* [[SCEVGEP14]], [[A]] +; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] +; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and 
i1 [[BOUND016]], [[BOUND117]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4, !alias.scope !0 +; AVX1-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef), !alias.scope !3 +; AVX1-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP5]], <8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP2]]), !alias.scope !5, !noalias !7 +; AVX1-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 8 +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] +; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !0 +; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]] +; AVX1-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP12]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3 +; AVX1-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]] +; AVX1-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP13]], <8 x i32>* [[TMP15]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !5, !noalias !7 +; AVX1-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 16 +; AVX1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 10000 +; AVX1-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX1: for.body: +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ] +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP17:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100 +; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX1: if.then: +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP18:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]] +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: br label [[FOR_INC]] +; AVX1: for.inc: +; 
AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP19]], 100 +; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]] +; AVX1: for.end: +; AVX1-NEXT: ret void +; AVX1: if.then.1: +; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP20:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4 +; AVX1-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP20]], [[TMP19]] +; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4 +; AVX1-NEXT: br label [[FOR_INC_1]] +; AVX1: for.inc.1: +; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000 +; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !10 +; +; AVX2-LABEL: @foo1( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000 +; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000 +; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000 +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]] +; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]] +; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt i32* [[SCEVGEP14]], [[A]] +; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] +; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 8 +; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP0]], i64 16 +; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP0]], i64 24 +; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], +; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], +; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> 
@llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i64 8 +; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[TMP12]], i64 16 +; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[TMP12]], i64 24 +; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX2-NEXT: [[TMP21:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] +; AVX2-NEXT: [[TMP22:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] +; AVX2-NEXT: [[TMP23:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP20]], <8 x i32>* [[TMP25]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP24]], i64 8 +; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP21]], <8 x i32>* [[TMP27]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP24]], i64 16 +; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP22]], <8 x i32>* [[TMP29]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP24]], i64 24 +; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP23]], <8 x i32>* [[TMP31]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] +; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP33]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[TMP32]], i64 8 +; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP35]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, i32* [[TMP32]], i64 16 +; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP37]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[TMP32]], i64 24 +; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <8 x i32>* +; AVX2-NEXT: 
[[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP39]], align 4, !alias.scope !0 +; AVX2-NEXT: [[TMP40:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], +; AVX2-NEXT: [[TMP41:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22_1]], +; AVX2-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23_1]], +; AVX2-NEXT: [[TMP43:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]], +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]] +; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP44]], i64 8 +; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[TMP44]], i64 16 +; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[TMP44]], i64 24 +; AVX2-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !3 +; AVX2-NEXT: [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX2-NEXT: [[TMP53:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] +; AVX2-NEXT: [[TMP54:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] +; AVX2-NEXT: [[TMP55:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] +; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]] +; AVX2-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP52]], <8 x i32>* [[TMP57]], i32 4, <8 x i1> [[TMP40]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[TMP58:%.*]] = getelementptr i32, i32* [[TMP56]], i64 8 +; AVX2-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP53]], <8 x i32>* [[TMP59]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[TMP60:%.*]] = getelementptr i32, i32* [[TMP56]], i64 16 +; AVX2-NEXT: [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP54]], <8 x i32>* [[TMP61]], i32 4, <8 x i1> [[TMP42]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[TMP62:%.*]] = getelementptr i32, i32* [[TMP56]], i64 24 +; AVX2-NEXT: [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <8 x i32>* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP55]], <8 x i32>* [[TMP63]], i32 4, <8 x i1> [[TMP43]]), !alias.scope !5, !noalias !7 +; AVX2-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 64 +; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984 +; AVX2-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX2: for.body.preheader: +; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX2-NEXT: br 
label [[FOR_BODY:%.*]] +; AVX2: for.body: +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100 +; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX2: if.then: +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP66:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]] +; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4 +; AVX2-NEXT: br label [[FOR_INC]] +; AVX2: for.inc: +; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP67:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100 +; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX2: for.end: +; AVX2-NEXT: ret void +; AVX2: if.then.1: +; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP68:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4 +; AVX2-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]] +; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4 +; AVX2-NEXT: br label [[FOR_INC_1]] +; AVX2: for.inc.1: +; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 +; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100 +; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX2: if.then.2: +; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP70:%.*]] = load i32, i32* [[ARRAYIDX3_2]], align 4 +; AVX2-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]] +; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: store i32 [[ADD_2]], i32* [[ARRAYIDX7_2]], align 4 +; AVX2-NEXT: br label [[FOR_INC_2]] +; AVX2: for.inc.2: +; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 +; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP71:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100 +; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX2: if.then.3: +; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP72:%.*]] = load i32, i32* [[ARRAYIDX3_3]], align 4 +; AVX2-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]] +; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: store i32 [[ADD_3]], i32* [[ARRAYIDX7_3]], align 4 +; AVX2-NEXT: br label [[FOR_INC_3]] +; AVX2: for.inc.3: +; AVX2-NEXT: 
[[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000 +; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !10 +; +; AVX512-LABEL: @foo1( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 10000 +; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000 +; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 10000 +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt i32* [[SCEVGEP11]], [[A]] +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[SCEVGEP]], [[TRIGGER]] +; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt i32* [[SCEVGEP14]], [[A]] +; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt i32* [[SCEVGEP]], [[B]] +; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP1]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[TMP0]], i64 16 +; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP0]], i64 32 +; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP0]], i64 48 +; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], +; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], +; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[TMP12]], i64 16 +; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[TMP12]], i64 32 +; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr 
i32, i32* [[TMP12]], i64 48 +; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX512-NEXT: [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] +; AVX512-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] +; AVX512-NEXT: [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP20]], <16 x i32>* [[TMP25]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32* [[TMP24]], i64 16 +; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP21]], <16 x i32>* [[TMP27]], i32 4, <16 x i1> [[TMP9]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32* [[TMP24]], i64 32 +; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32* [[TMP28]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP22]], <16 x i32>* [[TMP29]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32* [[TMP24]], i64 48 +; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32* [[TMP30]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP23]], <16 x i32>* [[TMP31]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64 +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32* [[TMP32]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP33]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, i32* [[TMP32]], i64 16 +; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32* [[TMP34]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP35]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, i32* [[TMP32]], i64 32 +; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP37]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32* [[TMP32]], i64 48 +; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32>* [[TMP39]], align 4, !alias.scope !0 +; AVX512-NEXT: [[TMP40:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], +; AVX512-NEXT: [[TMP41:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22_1]], +; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23_1]], +; AVX512-NEXT: [[TMP43:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]], +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32* [[TMP44]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x 
i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32* [[TMP44]], i64 16 +; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32* [[TMP46]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr i32, i32* [[TMP44]], i64 32 +; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32* [[TMP48]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr i32, i32* [[TMP44]], i64 48 +; AVX512-NEXT: [[TMP51:%.*]] = bitcast i32* [[TMP50]] to <16 x i32>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>* [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !3 +; AVX512-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX512-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] +; AVX512-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] +; AVX512-NEXT: [[TMP55:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] +; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP57:%.*]] = bitcast i32* [[TMP56]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP52]], <16 x i32>* [[TMP57]], i32 4, <16 x i1> [[TMP40]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr i32, i32* [[TMP56]], i64 16 +; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32* [[TMP58]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP53]], <16 x i32>* [[TMP59]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr i32, i32* [[TMP56]], i64 32 +; AVX512-NEXT: [[TMP61:%.*]] = bitcast i32* [[TMP60]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP54]], <16 x i32>* [[TMP61]], i32 4, <16 x i1> [[TMP42]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr i32, i32* [[TMP56]], i64 48 +; AVX512-NEXT: [[TMP63:%.*]] = bitcast i32* [[TMP62]] to <16 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32> [[TMP55]], <16 x i32>* [[TMP63]], i32 4, <16 x i1> [[TMP43]]), !alias.scope !5, !noalias !7 +; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 128 +; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984 +; AVX512-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !8 +; AVX512: for.body.preheader: +; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100 +; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[ARRAYIDX3:%.*]] 
= getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP66:%.*]] = load i32, i32* [[ARRAYIDX3]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]] +; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX7]], align 4 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP67:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: +; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP68:%.*]] = load i32, i32* [[ARRAYIDX3_1]], align 4 +; AVX512-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]] +; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: store i32 [[ADD_1]], i32* [[ARRAYIDX7_1]], align 4 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP69:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP70:%.*]] = load i32, i32* [[ARRAYIDX3_2]], align 4 +; AVX512-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]] +; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: store i32 [[ADD_2]], i32* [[ARRAYIDX7_2]], align 4 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 +; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP71:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX512: if.then.3: +; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP72:%.*]] = load i32, i32* [[ARRAYIDX3_3]], align 4 +; AVX512-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]] +; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: store i32 [[ADD_3]], i32* [[ARRAYIDX7_3]], align 4 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000 +; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !10 +; entry: %A.addr = alloca i32*, align 8 %B.addr = alloca i32*, align 8 @@ -91,22 +490,420 @@ for.end: ; preds = %for.cond ; The same as @foo1 but all the pointers are address space 1 pointers. 
-;AVX-LABEL: @foo1_addrspace1 -;AVX: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8i32.p1v8i32 -;AVX: add nsw <8 x i32> -;AVX: call void @llvm.masked.store.v8i32.p1v8i32 -;AVX: ret void - -;AVX512-LABEL: @foo1_addrspace1 -;AVX512: icmp slt <16 x i32> %wide.load, @llvm.masked.load.v16i32.p1v16i32 -;AVX512: add nsw <16 x i32> -;AVX512: call void @llvm.masked.store.v16i32.p1v16i32 -;AVX512: ret void - ; Function Attrs: nounwind uwtable define void @foo1_addrspace1(i32 addrspace(1)* %A, i32 addrspace(1)* %B, i32 addrspace(1)* %trigger) { +; AVX1-LABEL: @foo1_addrspace1( +; AVX1-NEXT: entry: +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000 +; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000 +; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000 +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]] +; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]] +; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP14]], [[A]] +; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]] +; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX1-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11 +; AVX1-NEXT: [[TMP2:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32 addrspace(1)* [[TMP3]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef), !alias.scope !14 +; AVX1-NEXT: [[TMP5:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP5]], <8 x i32> addrspace(1)* [[TMP7]], i32 4, <8 x i1> [[TMP2]]), !alias.scope !16, !noalias !18 +; AVX1-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 8 +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]] +; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32 addrspace(1)* [[TMP8]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP9]], align 4, !alias.scope !11 +; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]] +; AVX1-NEXT: [[TMP12:%.*]] = bitcast i32 addrspace(1)* [[TMP11]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> 
addrspace(1)* [[TMP12]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !14 +; AVX1-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]] +; AVX1-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)* +; AVX1-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP13]], <8 x i32> addrspace(1)* [[TMP15]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !16, !noalias !18 +; AVX1-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 16 +; AVX1-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 10000 +; AVX1-NEXT: br i1 [[TMP16]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX1: for.body: +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ] +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP17:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP17]], 100 +; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX1: if.then: +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP18:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 +; AVX1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP18]], [[TMP17]] +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: br label [[FOR_INC]] +; AVX1: for.inc: +; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP19:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4 +; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP19]], 100 +; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]] +; AVX1: for.end: +; AVX1-NEXT: ret void +; AVX1: if.then.1: +; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP20:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4 +; AVX1-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP20]], [[TMP19]] +; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4 +; AVX1-NEXT: br label [[FOR_INC_1]] +; AVX1: for.inc.1: +; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000 +; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !20 +; +; AVX2-LABEL: @foo1_addrspace1( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000 +; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000 +; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000 +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]] +; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]] +; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* 
[[SCEVGEP14]], [[A]] +; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]] +; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX2-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 8 +; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 16 +; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 24 +; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP8:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX2-NEXT: [[TMP9:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], +; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], +; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP13]], i32 4, <8 x i1> [[TMP8]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 8 +; AVX2-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP15]], i32 4, <8 x i1> [[TMP9]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 16 +; AVX2-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP17]], i32 4, <8 x i1> [[TMP10]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 24 +; AVX2-NEXT: [[TMP19:%.*]] = bitcast i32 addrspace(1)* [[TMP18]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP19]], i32 4, <8 x i1> [[TMP11]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX2-NEXT: [[TMP21:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] +; AVX2-NEXT: 
[[TMP22:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] +; AVX2-NEXT: [[TMP23:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP20]], <8 x i32> addrspace(1)* [[TMP25]], i32 4, <8 x i1> [[TMP8]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 8 +; AVX2-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP21]], <8 x i32> addrspace(1)* [[TMP27]], i32 4, <8 x i1> [[TMP9]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 16 +; AVX2-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP22]], <8 x i32> addrspace(1)* [[TMP29]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 24 +; AVX2-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP23]], <8 x i32> addrspace(1)* [[TMP31]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 32 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]] +; AVX2-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP33]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 8 +; AVX2-NEXT: [[TMP35:%.*]] = bitcast i32 addrspace(1)* [[TMP34]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD22_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP35]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 16 +; AVX2-NEXT: [[TMP37:%.*]] = bitcast i32 addrspace(1)* [[TMP36]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD23_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP37]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 24 +; AVX2-NEXT: [[TMP39:%.*]] = bitcast i32 addrspace(1)* [[TMP38]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_LOAD24_1:%.*]] = load <8 x i32>, <8 x i32> addrspace(1)* [[TMP39]], align 4, !alias.scope !11 +; AVX2-NEXT: [[TMP40:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD_1]], +; AVX2-NEXT: [[TMP41:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22_1]], +; AVX2-NEXT: [[TMP42:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23_1]], +; AVX2-NEXT: [[TMP43:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24_1]], +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]] +; AVX2-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP45]], i32 4, <8 x i1> [[TMP40]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 8 
+; AVX2-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP47]], i32 4, <8 x i1> [[TMP41]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 16 +; AVX2-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP49]], i32 4, <8 x i1> [[TMP42]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP50:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 24 +; AVX2-NEXT: [[TMP51:%.*]] = bitcast i32 addrspace(1)* [[TMP50]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p1v8i32(<8 x i32> addrspace(1)* [[TMP51]], i32 4, <8 x i1> [[TMP43]], <8 x i32> undef), !alias.scope !14 +; AVX2-NEXT: [[TMP52:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX2-NEXT: [[TMP53:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] +; AVX2-NEXT: [[TMP54:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] +; AVX2-NEXT: [[TMP55:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] +; AVX2-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]] +; AVX2-NEXT: [[TMP57:%.*]] = bitcast i32 addrspace(1)* [[TMP56]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP52]], <8 x i32> addrspace(1)* [[TMP57]], i32 4, <8 x i1> [[TMP40]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[TMP58:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 8 +; AVX2-NEXT: [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP53]], <8 x i32> addrspace(1)* [[TMP59]], i32 4, <8 x i1> [[TMP41]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[TMP60:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 16 +; AVX2-NEXT: [[TMP61:%.*]] = bitcast i32 addrspace(1)* [[TMP60]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP54]], <8 x i32> addrspace(1)* [[TMP61]], i32 4, <8 x i1> [[TMP42]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[TMP62:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 24 +; AVX2-NEXT: [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <8 x i32> addrspace(1)* +; AVX2-NEXT: call void @llvm.masked.store.v8i32.p1v8i32(<8 x i32> [[TMP55]], <8 x i32> addrspace(1)* [[TMP63]], i32 4, <8 x i1> [[TMP43]]), !alias.scope !16, !noalias !18 +; AVX2-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 64 +; AVX2-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984 +; AVX2-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX2: for.body.preheader: +; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: for.body: +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 +; AVX2-NEXT: 
[[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100 +; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX2: if.then: +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP66:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]] +; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4 +; AVX2-NEXT: br label [[FOR_INC]] +; AVX2: for.inc: +; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP67:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4 +; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100 +; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX2: for.end: +; AVX2-NEXT: ret void +; AVX2: if.then.1: +; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP68:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4 +; AVX2-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]] +; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4 +; AVX2-NEXT: br label [[FOR_INC_1]] +; AVX2: for.inc.1: +; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 +; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP69:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_2]], align 4 +; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100 +; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX2: if.then.2: +; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP70:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_2]], align 4 +; AVX2-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]] +; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: store i32 [[ADD_2]], i32 addrspace(1)* [[ARRAYIDX7_2]], align 4 +; AVX2-NEXT: br label [[FOR_INC_2]] +; AVX2: for.inc.2: +; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 +; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP71:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_3]], align 4 +; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100 +; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX2: if.then.3: +; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP72:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_3]], align 4 +; AVX2-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]] +; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: store i32 [[ADD_3]], i32 addrspace(1)* [[ARRAYIDX7_3]], align 4 +; AVX2-NEXT: br label [[FOR_INC_3]] +; AVX2: for.inc.3: +; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; AVX2-NEXT: 
[[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000 +; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !20 +; +; AVX512-LABEL: @foo1_addrspace1( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr i32, i32 addrspace(1)* [[A:%.*]], i64 10000 +; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32 addrspace(1)* [[TRIGGER:%.*]], i64 10000 +; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[B:%.*]], i64 10000 +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP11]], [[A]] +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[TRIGGER]] +; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP14]], [[A]] +; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt i32 addrspace(1)* [[SCEVGEP]], [[B]] +; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_1:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i32 addrspace(1)* [[TMP0]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP1]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 16 +; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32 addrspace(1)* [[TMP2]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP3]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 32 +; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32 addrspace(1)* [[TMP4]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP5]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP0]], i64 48 +; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32 addrspace(1)* [[TMP6]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP7]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP8:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP9:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], +; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], +; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32 addrspace(1)* [[TMP12]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP13]], i32 4, <16 x i1> [[TMP8]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP14:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 16 +; AVX512-NEXT: [[TMP15:%.*]] = bitcast i32 addrspace(1)* [[TMP14]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP15]], i32 4, <16 x i1> [[TMP9]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: 
[[TMP16:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 32 +; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 addrspace(1)* [[TMP16]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP17]], i32 4, <16 x i1> [[TMP10]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP12]], i64 48 +; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 addrspace(1)* [[TMP18]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP19]], i32 4, <16 x i1> [[TMP11]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP20:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD]], [[WIDE_LOAD]] +; AVX512-NEXT: [[TMP21:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25]], [[WIDE_LOAD22]] +; AVX512-NEXT: [[TMP22:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26]], [[WIDE_LOAD23]] +; AVX512-NEXT: [[TMP23:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_LOAD24]] +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP25:%.*]] = bitcast i32 addrspace(1)* [[TMP24]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP20]], <16 x i32> addrspace(1)* [[TMP25]], i32 4, <16 x i1> [[TMP8]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP26:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 16 +; AVX512-NEXT: [[TMP27:%.*]] = bitcast i32 addrspace(1)* [[TMP26]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP21]], <16 x i32> addrspace(1)* [[TMP27]], i32 4, <16 x i1> [[TMP9]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 32 +; AVX512-NEXT: [[TMP29:%.*]] = bitcast i32 addrspace(1)* [[TMP28]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP22]], <16 x i32> addrspace(1)* [[TMP29]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP30:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP24]], i64 48 +; AVX512-NEXT: [[TMP31:%.*]] = bitcast i32 addrspace(1)* [[TMP30]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP23]], <16 x i32> addrspace(1)* [[TMP31]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[INDEX_NEXT:%.*]] = or i64 [[INDEX]], 64 +; AVX512-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP33:%.*]] = bitcast i32 addrspace(1)* [[TMP32]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP33]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 16 +; AVX512-NEXT: [[TMP35:%.*]] = bitcast i32 addrspace(1)* [[TMP34]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD22_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP35]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 32 +; AVX512-NEXT: [[TMP37:%.*]] = bitcast i32 addrspace(1)* [[TMP36]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD23_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP37]], align 4, !alias.scope !11 +; 
AVX512-NEXT: [[TMP38:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP32]], i64 48 +; AVX512-NEXT: [[TMP39:%.*]] = bitcast i32 addrspace(1)* [[TMP38]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_LOAD24_1:%.*]] = load <16 x i32>, <16 x i32> addrspace(1)* [[TMP39]], align 4, !alias.scope !11 +; AVX512-NEXT: [[TMP40:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD_1]], +; AVX512-NEXT: [[TMP41:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22_1]], +; AVX512-NEXT: [[TMP42:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23_1]], +; AVX512-NEXT: [[TMP43:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24_1]], +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP45:%.*]] = bitcast i32 addrspace(1)* [[TMP44]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP45]], i32 4, <16 x i1> [[TMP40]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP46:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 16 +; AVX512-NEXT: [[TMP47:%.*]] = bitcast i32 addrspace(1)* [[TMP46]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD25_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP47]], i32 4, <16 x i1> [[TMP41]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP48:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 32 +; AVX512-NEXT: [[TMP49:%.*]] = bitcast i32 addrspace(1)* [[TMP48]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD26_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP49]], i32 4, <16 x i1> [[TMP42]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP50:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP44]], i64 48 +; AVX512-NEXT: [[TMP51:%.*]] = bitcast i32 addrspace(1)* [[TMP50]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: [[WIDE_MASKED_LOAD27_1:%.*]] = call <16 x i32> @llvm.masked.load.v16i32.p1v16i32(<16 x i32> addrspace(1)* [[TMP51]], i32 4, <16 x i1> [[TMP43]], <16 x i32> undef), !alias.scope !14 +; AVX512-NEXT: [[TMP52:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD_1]], [[WIDE_LOAD_1]] +; AVX512-NEXT: [[TMP53:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD25_1]], [[WIDE_LOAD22_1]] +; AVX512-NEXT: [[TMP54:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD26_1]], [[WIDE_LOAD23_1]] +; AVX512-NEXT: [[TMP55:%.*]] = add nsw <16 x i32> [[WIDE_MASKED_LOAD27_1]], [[WIDE_LOAD24_1]] +; AVX512-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDEX_NEXT]] +; AVX512-NEXT: [[TMP57:%.*]] = bitcast i32 addrspace(1)* [[TMP56]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP52]], <16 x i32> addrspace(1)* [[TMP57]], i32 4, <16 x i1> [[TMP40]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP58:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 16 +; AVX512-NEXT: [[TMP59:%.*]] = bitcast i32 addrspace(1)* [[TMP58]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP53]], <16 x i32> addrspace(1)* [[TMP59]], i32 4, <16 x i1> [[TMP41]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP60:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 32 +; AVX512-NEXT: [[TMP61:%.*]] = bitcast i32 addrspace(1)* [[TMP60]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP54]], <16 x i32> addrspace(1)* [[TMP61]], i32 4, <16 x i1> 
[[TMP42]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[TMP62:%.*]] = getelementptr i32, i32 addrspace(1)* [[TMP56]], i64 48 +; AVX512-NEXT: [[TMP63:%.*]] = bitcast i32 addrspace(1)* [[TMP62]] to <16 x i32> addrspace(1)* +; AVX512-NEXT: call void @llvm.masked.store.v16i32.p1v16i32(<16 x i32> [[TMP55]], <16 x i32> addrspace(1)* [[TMP63]], i32 4, <16 x i1> [[TMP43]]), !alias.scope !16, !noalias !18 +; AVX512-NEXT: [[INDEX_NEXT_1]] = add nuw nsw i64 [[INDEX]], 128 +; AVX512-NEXT: [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT_1]], 9984 +; AVX512-NEXT: br i1 [[TMP64]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !19 +; AVX512: for.body.preheader: +; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX512-NEXT: br label [[FOR_BODY:%.*]] +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP65:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP65]], 100 +; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP66:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3]], align 4 +; AVX512-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP66]], [[TMP65]] +; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store i32 [[ADD]], i32 addrspace(1)* [[ARRAYIDX7]], align 4 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP67:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP67]], 100 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: +; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP68:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_1]], align 4 +; AVX512-NEXT: [[ADD_1:%.*]] = add nsw i32 [[TMP68]], [[TMP67]] +; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: store i32 [[ADD_1]], i32 addrspace(1)* [[ARRAYIDX7_1]], align 4 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP69:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP69]], 100 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP70:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_2]], align 4 +; AVX512-NEXT: [[ADD_2:%.*]] = add nsw i32 [[TMP70]], [[TMP69]] +; AVX512-NEXT: 
[[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: store i32 [[ADD_2]], i32 addrspace(1)* [[ARRAYIDX7_2]], align 4 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3 +; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP71:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP71]], 100 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX512: if.then.3: +; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[B]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP72:%.*]] = load i32, i32 addrspace(1)* [[ARRAYIDX3_3]], align 4 +; AVX512-NEXT: [[ADD_3:%.*]] = add nsw i32 [[TMP72]], [[TMP71]] +; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[A]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: store i32 [[ADD_3]], i32 addrspace(1)* [[ARRAYIDX7_3]], align 4 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4 +; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000 +; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !20 +; entry: %A.addr = alloca i32 addrspace(1)*, align 8 %B.addr = alloca i32 addrspace(1)*, align 8 @@ -175,22 +972,382 @@ for.end: ; preds = %for.cond ; } ;} -;AVX-LABEL: @foo2 -;AVX: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8f32.p0v8f32 -;AVX: fadd <8 x float> -;AVX: call void @llvm.masked.store.v8f32.p0v8f32 -;AVX: ret void - -;AVX512-LABEL: @foo2 -;AVX512: icmp slt <16 x i32> %wide.load, @llvm.masked.load.v16f32.p0v16f32 -;AVX512: fadd <16 x float> -;AVX512: call void @llvm.masked.store.v16f32.p0v16f32 -;AVX512: ret void - ; Function Attrs: nounwind uwtable define void @foo2(float* %A, float* %B, i32* %trigger) { +; AVX1-LABEL: @foo2( +; AVX1-NEXT: entry: +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000 +; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000 +; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000 +; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float* +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]] +; AVX1-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32* +; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]] +; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]] +; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21 +; AVX1-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8 +; 
AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21 +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 16 +; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21 +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 24 +; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21 +; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX1-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], +; AVX1-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], +; AVX1-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], +; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24 +; AVX1-NEXT: [[TMP16:%.*]] = getelementptr float, float* [[TMP14]], i64 8 +; AVX1-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24 +; AVX1-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[TMP14]], i64 16 +; AVX1-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24 +; AVX1-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[TMP14]], i64 24 +; AVX1-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24 +; AVX1-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> +; AVX1-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float> +; AVX1-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float> +; AVX1-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x float> +; AVX1-NEXT: [[TMP26:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP22]] +; AVX1-NEXT: [[TMP27:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]] +; AVX1-NEXT: [[TMP28:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]] +; AVX1-NEXT: [[TMP29:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]] +; AVX1-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP26]], <8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP30]], i64 8 +; AVX1-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>* +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP27]], <8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: [[TMP34:%.*]] = getelementptr float, float* [[TMP30]], i64 16 +; 
AVX1-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <8 x float>* +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP28]], <8 x float>* [[TMP35]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP30]], i64 24 +; AVX1-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <8 x float>* +; AVX1-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP29]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP13]]), !alias.scope !26, !noalias !28 +; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; AVX1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX1-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX1: for.body.preheader: +; AVX1-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX1-NEXT: br label [[FOR_BODY:%.*]] +; AVX1: for.body: +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ] +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100 +; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX1: if.then: +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to float +; AVX1-NEXT: [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]] +; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4 +; AVX1-NEXT: br label [[FOR_INC]] +; AVX1: for.inc: +; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100 +; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]] +; AVX1: for.end: +; AVX1-NEXT: ret void +; AVX1: if.then.1: +; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4 +; AVX1-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to float +; AVX1-NEXT: [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]] +; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4 +; AVX1-NEXT: br label [[FOR_INC_1]] +; AVX1: for.inc.1: +; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2 +; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000 +; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30 +; +; AVX2-LABEL: @foo2( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000 +; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000 +; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000 +; AVX2-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float* +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]] +; AVX2-NEXT: [[TMP1:%.*]] = 
bitcast float* [[SCEVGEP]] to i32* +; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]] +; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]] +; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !21 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8 +; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !21 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 16 +; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !21 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 24 +; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>* +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !21 +; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], +; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], +; AVX2-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], +; AVX2-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <8 x float>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP15]], i32 4, <8 x i1> [[TMP10]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr float, float* [[TMP14]], i64 8 +; AVX2-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <8 x float>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP17]], i32 4, <8 x i1> [[TMP11]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[TMP14]], i64 16 +; AVX2-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <8 x float>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP19]], i32 4, <8 x i1> [[TMP12]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[TMP14]], i64 24 +; AVX2-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <8 x float>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP21]], i32 4, <8 x i1> [[TMP13]], <8 x float> undef), !alias.scope !24 +; AVX2-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x float> +; AVX2-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x float> +; AVX2-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x float> +; AVX2-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x float> +; AVX2-NEXT: 
[[TMP26:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD]], [[TMP22]] +; AVX2-NEXT: [[TMP27:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]] +; AVX2-NEXT: [[TMP28:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]] +; AVX2-NEXT: [[TMP29:%.*]] = fadd <8 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]] +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <8 x float>* +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP26]], <8 x float>* [[TMP31]], i32 4, <8 x i1> [[TMP10]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP30]], i64 8 +; AVX2-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <8 x float>* +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP27]], <8 x float>* [[TMP33]], i32 4, <8 x i1> [[TMP11]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr float, float* [[TMP30]], i64 16 +; AVX2-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <8 x float>* +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP28]], <8 x float>* [[TMP35]], i32 4, <8 x i1> [[TMP12]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP30]], i64 24 +; AVX2-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <8 x float>* +; AVX2-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP29]], <8 x float>* [[TMP37]], i32 4, <8 x i1> [[TMP13]]), !alias.scope !26, !noalias !28 +; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984 +; AVX2-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29 +; AVX2: for.body.preheader: +; AVX2-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ] +; AVX2-NEXT: br label [[FOR_BODY:%.*]] +; AVX2: for.body: +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ] +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100 +; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX2: if.then: +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to float +; AVX2-NEXT: [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]] +; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4 +; AVX2-NEXT: br label [[FOR_INC]] +; AVX2: for.inc: +; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1 +; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100 +; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX2: for.end: +; AVX2-NEXT: ret void +; AVX2: if.then.1: +; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4 +; AVX2-NEXT: [[CONV_1:%.*]] 
= sitofp i32 [[TMP41]] to float
+; AVX2-NEXT: [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]]
+; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4
+; AVX2-NEXT: br label [[FOR_INC_1]]
+; AVX2: for.inc.1:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
+; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2: if.then.2:
+; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP44:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
+; AVX2-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to float
+; AVX2-NEXT: [[ADD_2:%.*]] = fadd float [[TMP44]], [[CONV_2]]
+; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: store float [[ADD_2]], float* [[ARRAYIDX7_2]], align 4
+; AVX2-NEXT: br label [[FOR_INC_2]]
+; AVX2: for.inc.2:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
+; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2: if.then.3:
+; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP46:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
+; AVX2-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to float
+; AVX2-NEXT: [[ADD_3:%.*]] = fadd float [[TMP46]], [[CONV_3]]
+; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: store float [[ADD_3]], float* [[ARRAYIDX7_3]], align 4
+; AVX2-NEXT: br label [[FOR_INC_3]]
+; AVX2: for.inc.3:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30
+;
+; AVX512-LABEL: @foo2(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[A:%.*]], i64 10000
+; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr float, float* [[B:%.*]], i64 10000
+; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to float*
+; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt float* [[TMP0]], [[A]]
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast float* [[SCEVGEP]] to i32*
+; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt float* [[SCEVGEP14]], [[A]]
+; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt float* [[SCEVGEP]], [[B]]
+; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, <16 x i32>* [[TMP3]], align 4, !alias.scope !21
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 16
+; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <16 x i32>, <16 x i32>* [[TMP5]], align 4, !alias.scope !21
+; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 32
+; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <16 x i32>, <16 x i32>* [[TMP7]], align 4, !alias.scope !21
+; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 48
+; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <16 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i32>, <16 x i32>* [[TMP9]], align 4, !alias.scope !21
+; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP12:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP13:%.*]] = icmp slt <16 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <16 x float>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP15]], i32 4, <16 x i1> [[TMP10]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr float, float* [[TMP14]], i64 16
+; AVX512-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <16 x float>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP17]], i32 4, <16 x i1> [[TMP11]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr float, float* [[TMP14]], i64 32
+; AVX512-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP18]] to <16 x float>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP19]], i32 4, <16 x i1> [[TMP12]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr float, float* [[TMP14]], i64 48
+; AVX512-NEXT: [[TMP21:%.*]] = bitcast float* [[TMP20]] to <16 x float>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0v16f32(<16 x float>* [[TMP21]], i32 4, <16 x i1> [[TMP13]], <16 x float> undef), !alias.scope !24
+; AVX512-NEXT: [[TMP22:%.*]] = sitofp <16 x i32> [[WIDE_LOAD]] to <16 x float>
+; AVX512-NEXT: [[TMP23:%.*]] = sitofp <16 x i32> [[WIDE_LOAD22]] to <16 x float>
+; AVX512-NEXT: [[TMP24:%.*]] = sitofp <16 x i32> [[WIDE_LOAD23]] to <16 x float>
+; AVX512-NEXT: [[TMP25:%.*]] = sitofp <16 x i32> [[WIDE_LOAD24]] to <16 x float>
+; AVX512-NEXT: [[TMP26:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX512-NEXT: [[TMP27:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX512-NEXT: [[TMP28:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX512-NEXT: [[TMP29:%.*]] = fadd <16 x float> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP31:%.*]] = bitcast float* [[TMP30]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP26]], <16 x float>* [[TMP31]], i32 4, <16 x i1> [[TMP10]]), !alias.scope !26, !noalias !28
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr float, float* [[TMP30]], i64 16
+; AVX512-NEXT: [[TMP33:%.*]] = bitcast float* [[TMP32]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP27]], <16 x float>* [[TMP33]], i32 4, <16 x i1> [[TMP11]]), !alias.scope !26, !noalias !28
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr float, float* [[TMP30]], i64 32
+; AVX512-NEXT: [[TMP35:%.*]] = bitcast float* [[TMP34]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP28]], <16 x float>* [[TMP35]], i32 4, <16 x i1> [[TMP12]]), !alias.scope !26, !noalias !28
+; AVX512-NEXT: [[TMP36:%.*]] = getelementptr float, float* [[TMP30]], i64 48
+; AVX512-NEXT: [[TMP37:%.*]] = bitcast float* [[TMP36]] to <16 x float>*
+; AVX512-NEXT: call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> [[TMP29]], <16 x float>* [[TMP37]], i32 4, <16 x i1> [[TMP13]]), !alias.scope !26, !noalias !28
+; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 64
+; AVX512-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX512-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !29
+; AVX512: for.body.preheader:
+; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP40:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to float
+; AVX512-NEXT: [[ADD:%.*]] = fadd float [[TMP40]], [[CONV]]
+; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store float [[ADD]], float* [[ARRAYIDX7]], align 4
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+; AVX512: if.then.1:
+; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP42:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
+; AVX512-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to float
+; AVX512-NEXT: [[ADD_1:%.*]] = fadd float [[TMP42]], [[CONV_1]]
+; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: store float [[ADD_1]], float* [[ARRAYIDX7_1]], align 4
+; AVX512-NEXT: br label [[FOR_INC_1]]
+; AVX512: for.inc.1:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
+; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512: if.then.2:
+; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP44:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
+; AVX512-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to float
+; AVX512-NEXT: [[ADD_2:%.*]] = fadd float [[TMP44]], [[CONV_2]]
+; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: store float [[ADD_2]], float* [[ARRAYIDX7_2]], align 4
+; AVX512-NEXT: br label [[FOR_INC_2]]
+; AVX512: for.inc.2:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
+; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512: if.then.3:
+; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP46:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
+; AVX512-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to float
+; AVX512-NEXT: [[ADD_3:%.*]] = fadd float [[TMP46]], [[CONV_3]]
+; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: store float [[ADD_3]], float* [[ARRAYIDX7_3]], align 4
+; AVX512-NEXT: br label [[FOR_INC_3]]
+; AVX512: for.inc.3:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !30
+;
 entry:
 %A.addr = alloca float*, align 8
 %B.addr = alloca float*, align 8
@@ -260,25 +1417,376 @@ for.end: ; preds = %for.cond
 ; }
 ;}
-;AVX-LABEL: @foo3
-;AVX: icmp slt <4 x i32> %wide.load, @llvm.masked.load.v4f64.p0v4f64
-;AVX: sitofp <4 x i32> %wide.load to <4 x double>
-;AVX: fadd <4 x double>
-;AVX: call void @llvm.masked.store.v4f64.p0v4f64
-;AVX: ret void
-
-;AVX512-LABEL: @foo3
-;AVX512: icmp slt <8 x i32> %wide.load, @llvm.masked.load.v8f64.p0v8f64
-;AVX512: sitofp <8 x i32> %wide.load to <8 x double>
-;AVX512: fadd <8 x double>
-;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
-;AVX512: ret void
-
-
 ; Function Attrs: nounwind uwtable
 define void @foo3(double* %A, double* %B, i32* %trigger) #0 {
+; AVX1-LABEL: @foo3(
+; AVX1-NEXT: entry:
+; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000
+; AVX1-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX1-NEXT: [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000
+; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double*
+; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
+; AVX1-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX1-NEXT: [[BOUND016:%.*]] = icmp ugt double* [[SCEVGEP14]], [[A]]
+; AVX1-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
+; AVX1-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX1: vector.body:
+; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !31
+; AVX1-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 4
+; AVX1-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31
+; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8
+; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31
+; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 12
+; AVX1-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31
+; AVX1-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100>
+; AVX1-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT: [[TMP16:%.*]] = getelementptr double, double* [[TMP14]], i64 4
+; AVX1-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT: [[TMP18:%.*]] = getelementptr double, double* [[TMP14]], i64 8
+; AVX1-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[TMP14]], i64 12
+; AVX1-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>*
+; AVX1-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34
+; AVX1-NEXT: [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
+; AVX1-NEXT: [[TMP23:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double>
+; AVX1-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double>
+; AVX1-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD24]] to <4 x double>
+; AVX1-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX1-NEXT: [[TMP27:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX1-NEXT: [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX1-NEXT: [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX1-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38
+; AVX1-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP30]], i64 4
+; AVX1-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP11]]), !alias.scope !36, !noalias !38
+; AVX1-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP30]], i64 8
+; AVX1-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP28]], <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !36, !noalias !38
+; AVX1-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP30]], i64 12
+; AVX1-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
+; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP29]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !36, !noalias !38
+; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; AVX1-NEXT: br i1 [[TMP38]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !39
+; AVX1: for.body:
+; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 0, [[ENTRY]] ]
+; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1: if.then:
+; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to double
+; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]]
+; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX1-NEXT: br label [[FOR_INC]]
+; AVX1: for.inc:
+; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1: for.end:
+; AVX1-NEXT: ret void
+; AVX1: if.then.1:
+; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX1-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double
+; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]]
+; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX1-NEXT: br label [[FOR_INC_1]]
+; AVX1: for.inc.1:
+; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
+; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40
+;
+; AVX2-LABEL: @foo3(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000
+; AVX2-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX2-NEXT: [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000
+; AVX2-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double*
+; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
+; AVX2-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX2-NEXT: [[BOUND016:%.*]] = icmp ugt double* [[SCEVGEP14]], [[A]]
+; AVX2-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
+; AVX2-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX2: vector.body:
+; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4, !alias.scope !31
+; AVX2-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 4
+; AVX2-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4, !alias.scope !31
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8
+; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !31
+; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 12
+; AVX2-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4, !alias.scope !31
+; AVX2-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP11:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP13:%.*]] = icmp slt <4 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100>
+; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP15]], i32 8, <4 x i1> [[TMP10]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT: [[TMP16:%.*]] = getelementptr double, double* [[TMP14]], i64 4
+; AVX2-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <4 x double>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP17]], i32 8, <4 x i1> [[TMP11]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT: [[TMP18:%.*]] = getelementptr double, double* [[TMP14]], i64 8
+; AVX2-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <4 x double>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP19]], i32 8, <4 x i1> [[TMP12]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[TMP14]], i64 12
+; AVX2-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <4 x double>*
+; AVX2-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP21]], i32 8, <4 x i1> [[TMP13]], <4 x double> undef), !alias.scope !34
+; AVX2-NEXT: [[TMP22:%.*]] = sitofp <4 x i32> [[WIDE_LOAD]] to <4 x double>
+; AVX2-NEXT: [[TMP23:%.*]] = sitofp <4 x i32> [[WIDE_LOAD22]] to <4 x double>
+; AVX2-NEXT: [[TMP24:%.*]] = sitofp <4 x i32> [[WIDE_LOAD23]] to <4 x double>
+; AVX2-NEXT: [[TMP25:%.*]] = sitofp <4 x i32> [[WIDE_LOAD24]] to <4 x double>
+; AVX2-NEXT: [[TMP26:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX2-NEXT: [[TMP27:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX2-NEXT: [[TMP28:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX2-NEXT: [[TMP29:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <4 x double>*
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP26]], <4 x double>* [[TMP31]], i32 8, <4 x i1> [[TMP10]]), !alias.scope !36, !noalias !38
+; AVX2-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP30]], i64 4
+; AVX2-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <4 x double>*
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP27]], <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP11]]), !alias.scope !36, !noalias !38
+; AVX2-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP30]], i64 8
+; AVX2-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP28]], <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP12]]), !alias.scope !36, !noalias !38
+; AVX2-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP30]], i64 12
+; AVX2-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
+; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP29]], <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP13]]), !alias.scope !36, !noalias !38
+; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX2-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000
+; AVX2-NEXT: br i1 [[TMP38]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !39
+; AVX2: for.body:
+; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 0, [[ENTRY]] ]
+; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2: if.then:
+; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to double
+; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]]
+; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX2-NEXT: br label [[FOR_INC]]
+; AVX2: for.inc:
+; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2: for.end:
+; AVX2-NEXT: ret void
+; AVX2: if.then.1:
+; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX2-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double
+; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]]
+; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX2-NEXT: br label [[FOR_INC_1]]
+; AVX2: for.inc.1:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
+; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2: if.then.2:
+; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP44:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX2-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to double
+; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP44]], [[CONV_2]]
+; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
+; AVX2-NEXT: br label [[FOR_INC_2]]
+; AVX2: for.inc.2:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
+; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2: if.then.3:
+; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP46:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX2-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to double
+; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP46]], [[CONV_3]]
+; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
+; AVX2-NEXT: br label [[FOR_INC_3]]
+; AVX2: for.inc.3:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX2-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX2-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !40
+;
+; AVX512-LABEL: @foo3(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 10000
+; AVX512-NEXT: [[SCEVGEP11:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 10000
+; AVX512-NEXT: [[SCEVGEP14:%.*]] = getelementptr double, double* [[B:%.*]], i64 10000
+; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP11]] to double*
+; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT: [[BOUND016:%.*]] = icmp ugt double* [[SCEVGEP14]], [[A]]
+; AVX512-NEXT: [[BOUND117:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
+; AVX512-NEXT: [[FOUND_CONFLICT18:%.*]] = and i1 [[BOUND016]], [[BOUND117]]
+; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT18]]
+; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 4, !alias.scope !31
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[TMP2]], i64 8
+; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4, !alias.scope !31
+; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP2]], i64 16
+; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD23:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !31
+; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 24
+; AVX512-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4, !alias.scope !31
+; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP11:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD22]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP12:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD23]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP13:%.*]] = icmp slt <8 x i32> [[WIDE_LOAD24]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP15:%.*]] = bitcast double* [[TMP14]] to <8 x double>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP15]], i32 8, <8 x i1> [[TMP10]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr double, double* [[TMP14]], i64 8
+; AVX512-NEXT: [[TMP17:%.*]] = bitcast double* [[TMP16]] to <8 x double>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD25:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP17]], i32 8, <8 x i1> [[TMP11]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr double, double* [[TMP14]], i64 16
+; AVX512-NEXT: [[TMP19:%.*]] = bitcast double* [[TMP18]] to <8 x double>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD26:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP19]], i32 8, <8 x i1> [[TMP12]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double, double* [[TMP14]], i64 24
+; AVX512-NEXT: [[TMP21:%.*]] = bitcast double* [[TMP20]] to <8 x double>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD27:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP21]], i32 8, <8 x i1> [[TMP13]], <8 x double> undef), !alias.scope !34
+; AVX512-NEXT: [[TMP22:%.*]] = sitofp <8 x i32> [[WIDE_LOAD]] to <8 x double>
+; AVX512-NEXT: [[TMP23:%.*]] = sitofp <8 x i32> [[WIDE_LOAD22]] to <8 x double>
+; AVX512-NEXT: [[TMP24:%.*]] = sitofp <8 x i32> [[WIDE_LOAD23]] to <8 x double>
+; AVX512-NEXT: [[TMP25:%.*]] = sitofp <8 x i32> [[WIDE_LOAD24]] to <8 x double>
+; AVX512-NEXT: [[TMP26:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], [[TMP22]]
+; AVX512-NEXT: [[TMP27:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD25]], [[TMP23]]
+; AVX512-NEXT: [[TMP28:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD26]], [[TMP24]]
+; AVX512-NEXT: [[TMP29:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD27]], [[TMP25]]
+; AVX512-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP31:%.*]] = bitcast double* [[TMP30]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP26]], <8 x double>* [[TMP31]], i32 8, <8 x i1> [[TMP10]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT: [[TMP32:%.*]] = getelementptr double, double* [[TMP30]], i64 8
+; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP32]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP27]], <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP11]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP30]], i64 16
+; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP28]], <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP12]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP30]], i64 24
+; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP29]], <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP13]]), !alias.scope !36, !noalias !38
+; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; AVX512-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], 9984
+; AVX512-NEXT: br i1 [[TMP38]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !39
+; AVX512: for.body.preheader:
+; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP39:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP39]], 100
+; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP39]] to double
+; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP40]], [[CONV]]
+; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP41:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP41]], 100
+; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+; AVX512: if.then.1:
+; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP42:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX512-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP41]] to double
+; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP42]], [[CONV_1]]
+; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX512-NEXT: br label [[FOR_INC_1]]
+; AVX512: for.inc.1:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 2
+; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP43:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP43]], 100
+; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512: if.then.2:
+; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP44:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX512-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP43]] to double
+; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP44]], [[CONV_2]]
+; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
+; AVX512-NEXT: br label [[FOR_INC_2]]
+; AVX512: for.inc.2:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 3
+; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP45]], 100
+; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512: if.then.3:
+; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP46:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX512-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP45]] to double
+; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP46]], [[CONV_3]]
+; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
+; AVX512-NEXT: br label [[FOR_INC_3]]
+; AVX512: for.inc.3:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX512-NEXT: [[EXITCOND_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX512-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END:%.*]], label [[FOR_BODY]], !llvm.loop !40
+;
 entry:
 %A.addr = alloca double*, align 8
 %B.addr = alloca double*, align 8
@@ -348,17 +1856,272 @@ for.end: ; preds = %for.cond
 ; }
 ;}
-;AVX-LABEL: @foo4
-;AVX-NOT: llvm.masked
-;AVX: ret void
-
-;AVX512-LABEL: @foo4
-;AVX512-NOT: llvm.masked.load
-;AVX512: llvm.masked.gather
-;AVX512: ret void
-
 ; Function Attrs: nounwind uwtable
 define void @foo4(double* %A, double* %B, i32* %trigger) {
+; AVX1-LABEL: @foo4(
+; AVX1-NEXT: entry:
+; AVX1-NEXT: br label [[FOR_BODY:%.*]]
+; AVX1: for.body:
+; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ]
+; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1: if.then:
+; AVX1-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP1]]
+; AVX1-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX1-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
+; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]]
+; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX1-NEXT: br label [[FOR_INC]]
+; AVX1: for.inc:
+; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16
+; AVX1-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
+; AVX1-NEXT: br i1 [[CMP]], label [[FOR_BODY_1:%.*]], label [[FOR_END:%.*]]
+; AVX1: for.end:
+; AVX1-NEXT: ret void
+; AVX1: for.body.1:
+; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP3]], 100
+; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1: if.then.1:
+; AVX1-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1
+; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP4]]
+; AVX1-NEXT: [[TMP5:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX1-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP3]] to double
+; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP5]], [[CONV_1]]
+; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX1-NEXT: br label [[FOR_INC_1]]
+; AVX1: for.inc.1:
+; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 32
+; AVX1-NEXT: br label [[FOR_BODY]]
+;
+; AVX2-LABEL: @foo4(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: br label [[FOR_BODY:%.*]]
+; AVX2: for.body:
+; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ]
+; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2: if.then:
+; AVX2-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 [[TMP1]]
+; AVX2-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP0]] to double
+; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP2]], [[CONV]]
+; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX2-NEXT: br label [[FOR_INC]]
+; AVX2: for.inc:
+; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 16
+; AVX2-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT]], 10000
+; AVX2-NEXT: br i1 [[CMP]], label [[FOR_BODY_1:%.*]], label [[FOR_END:%.*]]
+; AVX2: for.end:
+; AVX2-NEXT: ret void
+; AVX2: for.body.1:
+; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP3]], 100
+; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2: if.then.1:
+; AVX2-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1
+; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP4]]
+; AVX2-NEXT: [[TMP5:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX2-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP3]] to double
+; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP5]], [[CONV_1]]
+; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX2-NEXT: br label [[FOR_INC_1]]
+; AVX2: for.inc.1:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = or i64 [[INDVARS_IV]], 32
+; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP6]], 100
+; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2: if.then.2:
+; AVX2-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1
+; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP7]]
+; AVX2-NEXT: [[TMP8:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX2-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP6]] to double
+; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP8]], [[CONV_2]]
+; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
+; AVX2-NEXT: br label [[FOR_INC_2]]
+; AVX2: for.inc.2:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = or i64 [[INDVARS_IV]], 48
+; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP9:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP9]], 100
+; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX2: if.then.3:
+; AVX2-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1
+; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP10]]
+; AVX2-NEXT: [[TMP11:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX2-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP9]] to double
+; AVX2-NEXT: [[ADD_3:%.*]] = fadd double [[TMP11]], [[CONV_3]]
+; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
+; AVX2-NEXT: br label [[FOR_INC_3]]
+; AVX2: for.inc.3:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nuw nsw i64 [[INDVARS_IV]], 64
+; AVX2-NEXT: br label [[FOR_BODY]]
+;
+; AVX512-LABEL: @foo4(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[A:%.*]], i64 9985
+; AVX512-NEXT: [[SCEVGEP12:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 9985
+; AVX512-NEXT: [[SCEVGEP15:%.*]] = getelementptr double, double* [[B:%.*]], i64 19969
+; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP12]] to double*
+; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[A]]
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32*
+; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]]
+; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; AVX512-NEXT: [[BOUND017:%.*]] = icmp ugt double* [[SCEVGEP15]], [[A]]
+; AVX512-NEXT: [[BOUND118:%.*]] = icmp ugt double* [[SCEVGEP]], [[B]]
+; AVX512-NEXT: [[FOUND_CONFLICT19:%.*]] = and i1 [[BOUND017]], [[BOUND118]]
+; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT19]]
+; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; AVX512-NEXT: [[VEC_IND:%.*]] = phi <8 x i64> [ [[VEC_IND_NEXT_2:%.*]], [[VECTOR_BODY]] ], [ <i64 0, i64 16, i64 32, i64 48, i64 64, i64 80, i64 96, i64 112>, [[ENTRY]] ]
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41
+; AVX512-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP4:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP4]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER20:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP5]], i32 8, <8 x i1> [[TMP3]], <8 x double> undef), !alias.scope !44
+; AVX512-NEXT: [[TMP6:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER]] to <8 x double>
+; AVX512-NEXT: [[TMP7:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20]], [[TMP6]]
+; AVX512-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND]]
+; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP7]], <8 x double*> [[TMP8]], i32 8, <8 x i1> [[TMP3]]), !alias.scope !46, !noalias !48
+; AVX512-NEXT: [[VEC_IND_NEXT:%.*]] = add <8 x i64> [[VEC_IND]], <i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128, i64 128>
+; AVX512-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND_NEXT]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_1:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP9]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41
+; AVX512-NEXT: [[TMP10:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER_1]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP11:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND_NEXT]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; AVX512-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP11]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER20_1:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP12]], i32 8, <8 x i1> [[TMP10]], <8 x double> undef), !alias.scope !44
+; AVX512-NEXT: [[TMP13:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER_1]] to <8 x double>
+; AVX512-NEXT: [[TMP14:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20_1]], [[TMP13]]
+; AVX512-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND_NEXT]]
+; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP14]], <8 x double*> [[TMP15]], i32 8, <8 x i1> [[TMP10]]), !alias.scope !46, !noalias !48
+; AVX512-NEXT: [[VEC_IND_NEXT_1:%.*]] = add <8 x i64> [[VEC_IND]], <i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256, i64 256>
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], <8 x i64> [[VEC_IND_NEXT_1]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER_2:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP16]], i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef), !alias.scope !41
+; AVX512-NEXT: [[TMP17:%.*]] = icmp slt <8 x i32> [[WIDE_MASKED_GATHER_2]], <i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100, i32 100>
+; AVX512-NEXT: [[TMP18:%.*]] = shl nuw nsw <8 x i64> [[VEC_IND_NEXT_1]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; AVX512-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, double* [[B]], <8 x i64> [[TMP18]]
+; AVX512-NEXT: [[WIDE_MASKED_GATHER20_2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> [[TMP19]], i32 8, <8 x i1> [[TMP17]], <8 x double> undef), !alias.scope !44
+; AVX512-NEXT: [[TMP20:%.*]] = sitofp <8 x i32> [[WIDE_MASKED_GATHER_2]] to <8 x double>
+; AVX512-NEXT: [[TMP21:%.*]] = fadd <8 x double> [[WIDE_MASKED_GATHER20_2]], [[TMP20]]
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, double* [[A]], <8 x i64> [[VEC_IND_NEXT_1]]
+; AVX512-NEXT: call void @llvm.masked.scatter.v8f64.v8p0f64(<8 x double> [[TMP21]], <8 x double*> [[TMP22]], i32 8, <8 x i1> [[TMP17]]), !alias.scope !46, !noalias !48
+; AVX512-NEXT: [[INDEX_NEXT_2]] = add nuw nsw i64 [[INDEX]], 24
+; AVX512-NEXT: [[VEC_IND_NEXT_2]] = add <8 x i64> [[VEC_IND]], <i64 384, i64 384, i64 384, i64 384, i64 384, i64 384, i64 384, i64 384>
+; AVX512-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT_2]], 624
+; AVX512-NEXT: br i1 [[TMP23]], label [[FOR_BODY_PREHEADER]], label [[VECTOR_BODY]], !llvm.loop !49
+; AVX512: for.body.preheader:
+; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ 9984, [[VECTOR_BODY]] ]
+; AVX512-NEXT: [[TMP24:%.*]] = sub nsw i64 9999, [[INDVARS_IV_PH]]
+; AVX512-NEXT: br label [[FOR_BODY_PROL:%.*]]
+; AVX512: for.body.prol:
+; AVX512-NEXT: [[INDVARS_IV_PROL:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PROL:%.*]], [[FOR_INC_PROL:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ]
+; AVX512-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_SUB:%.*]], [[FOR_INC_PROL]] ], [ 1, [[FOR_BODY_PREHEADER]] ]
+; AVX512-NEXT: [[ARRAYIDX_PROL:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_PROL]]
+; AVX512-NEXT: [[TMP25:%.*]] = load i32, i32* [[ARRAYIDX_PROL]], align 4
+; AVX512-NEXT: [[CMP1_PROL:%.*]] = icmp slt i32 [[TMP25]], 100
+; AVX512-NEXT: br i1 [[CMP1_PROL]], label [[IF_THEN_PROL:%.*]], label [[FOR_INC_PROL]]
+; AVX512: if.then.prol:
+; AVX512-NEXT: [[TMP26:%.*]] = shl nuw nsw i64 [[INDVARS_IV_PROL]], 1
+; AVX512-NEXT: [[ARRAYIDX3_PROL:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP26]]
+; AVX512-NEXT: [[TMP27:%.*]] = load double, double* [[ARRAYIDX3_PROL]], align 8
+; AVX512-NEXT: [[CONV_PROL:%.*]] = sitofp i32 [[TMP25]] to double
+; AVX512-NEXT: [[ADD_PROL:%.*]] = fadd double [[TMP27]], [[CONV_PROL]]
+; AVX512-NEXT: [[ARRAYIDX7_PROL:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_PROL]]
+; AVX512-NEXT: store double [[ADD_PROL]], double* [[ARRAYIDX7_PROL]], align 8
+; AVX512-NEXT: br label [[FOR_INC_PROL]]
+; AVX512: for.inc.prol:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_PROL]] = add nuw nsw i64 [[INDVARS_IV_PROL]], 16
+; AVX512-NEXT: [[PROL_ITER_SUB]] = add i64 [[PROL_ITER]], -1
+; AVX512-NEXT: [[PROL_ITER_CMP:%.*]] = icmp eq i64 [[PROL_ITER_SUB]], 0
+; AVX512-NEXT: br i1 [[PROL_ITER_CMP]], label [[FOR_BODY_PROL_LOOPEXIT:%.*]], label [[FOR_BODY_PROL]], !llvm.loop !50
+; AVX512: for.body.prol.loopexit:
+; AVX512-NEXT: [[DOTMASK:%.*]] = and i64 [[TMP24]], 9984
+; AVX512-NEXT: [[TMP28:%.*]] = icmp eq i64 [[DOTMASK]], 0
+; AVX512-NEXT: br i1 [[TMP28]], label [[FOR_END:%.*]], label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ [[INDVARS_IV_NEXT_PROL]], [[FOR_BODY_PROL_LOOPEXIT]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP29:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP29]], 100
+; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[TMP30:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP30]]
+; AVX512-NEXT: [[TMP31:%.*]] = load double, double* [[ARRAYIDX3]], align 8
+; AVX512-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP29]] to double
+; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP31]], [[CONV]]
+; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX7]], align 8
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 16
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP32:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP32]], 100
+; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+; AVX512: if.then.1:
+; AVX512-NEXT: [[TMP33:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT]], 1
+; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP33]]
+; AVX512-NEXT: [[TMP34:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8
+; AVX512-NEXT: [[CONV_1:%.*]] = sitofp i32 [[TMP32]] to double
+; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP34]], [[CONV_1]]
+; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: store double [[ADD_1]], double* [[ARRAYIDX7_1]], align 8
+; AVX512-NEXT: br label [[FOR_INC_1]]
+; AVX512: for.inc.1:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], 32
+; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP35:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP35]], 100
+; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512: if.then.2:
+; AVX512-NEXT: [[TMP36:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1
+; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP36]]
+; AVX512-NEXT: [[TMP37:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8
+; AVX512-NEXT: [[CONV_2:%.*]] = sitofp i32 [[TMP35]] to double
+; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP37]], [[CONV_2]]
+; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: store double [[ADD_2]], double* [[ARRAYIDX7_2]], align 8
+; AVX512-NEXT: br label [[FOR_INC_2]]
+; AVX512: for.inc.2:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], 48
+; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP38:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP38]], 100
+; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]]
+; AVX512: if.then.3:
+; AVX512-NEXT: [[TMP39:%.*]] = shl nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1
+; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP39]]
+; AVX512-NEXT: [[TMP40:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8
+; AVX512-NEXT: [[CONV_3:%.*]] = sitofp i32 [[TMP38]] to double
+; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP40]], [[CONV_3]]
+; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: store double [[ADD_3]], double* [[ARRAYIDX7_3]], align 8
+; AVX512-NEXT: br label [[FOR_INC_3]]
+; AVX512: for.inc.3:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], 64
+; AVX512-NEXT: [[CMP_3:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT_3]], 10000
+; AVX512-NEXT: br i1 [[CMP_3]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop !52
+;
 entry:
 %A.addr = alloca double*, align 8
 %B.addr = alloca double*, align 8
@@ -423,18 +2186,158 @@ for.end: ; preds = %for.cond
 ; The loop here should not be vectorized due to trapping
 ; constant expression
-;AVX-LABEL: @foo5
-;AVX-NOT: llvm.masked
-;AVX: store i32 sdiv
-;AVX: ret void
-
-;AVX512-LABEL: @foo5
-;AVX512-NOT: llvm.masked
-;AVX512: store i32 sdiv
-;AVX512: ret void
-
 ; Function Attrs: nounwind uwtable
 define void @foo5(i32* %A, i32* %B, i32* %trigger) {
+; AVX1-LABEL: @foo5(
+; AVX1-NEXT: entry:
+; AVX1-NEXT: br label [[FOR_BODY:%.*]]
+; AVX1: for.body:
+; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ]
+; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX1-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX1: if.then:
+; AVX1-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX1-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4
+; AVX1-NEXT: br label [[FOR_INC]]
+; AVX1: for.inc:
+; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = or i64 [[INDVARS_IV]], 1
+; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX1-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100
+; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]]
+; AVX1: for.end:
+; AVX1-NEXT: ret void
+; AVX1: if.then.1:
+; AVX1-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX1-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4
+; AVX1-NEXT: br label [[FOR_INC_1]]
+; AVX1: for.inc.1:
+; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX1-NEXT: [[EXITCOND_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_1]], 10000
+; AVX1-NEXT: br i1 [[EXITCOND_1]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+;
+; AVX2-LABEL: @foo5(
+; AVX2-NEXT: entry:
+; AVX2-NEXT: br label [[FOR_BODY:%.*]]
+; AVX2: for.body:
+; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_4:%.*]], [[FOR_INC_4:%.*]] ]
+; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX2-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX2: if.then:
+; AVX2-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4
+; AVX2-NEXT: br label [[FOR_INC]]
+; AVX2: for.inc:
+; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX2-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100
+; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX2: for.end:
+; AVX2-NEXT: ret void
+; AVX2: if.then.1:
+; AVX2-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4
+; AVX2-NEXT: br label [[FOR_INC_1]]
+; AVX2: for.inc.1:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX2-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP2]], 100
+; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX2: if.then.2:
+; AVX2-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_2]], align 4
+; AVX2-NEXT: br label [[FOR_INC_2]]
+; AVX2: for.inc.2:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX2-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP3]], 100
+; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3:%.*]]
+; AVX2: if.then.3:
+; AVX2-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_3]], align 4
+; AVX2-NEXT: br label [[FOR_INC_3]]
+; AVX2: for.inc.3:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX2-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_3]]
+; AVX2-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; AVX2-NEXT: [[CMP1_4:%.*]] = icmp slt i32 [[TMP4]], 100
+; AVX2-NEXT: br i1 [[CMP1_4]], label [[IF_THEN_4:%.*]], label [[FOR_INC_4]]
+; AVX2: if.then.4:
+; AVX2-NEXT: [[ARRAYIDX7_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_3]]
+; AVX2-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_4]], align 4
+; AVX2-NEXT: br label [[FOR_INC_4]]
+; AVX2: for.inc.4:
+; AVX2-NEXT: [[INDVARS_IV_NEXT_4]] = add nuw nsw i64 [[INDVARS_IV]], 5
+; AVX2-NEXT: [[EXITCOND_4:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_4]], 10000
+; AVX2-NEXT: br i1 [[EXITCOND_4]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+;
+; AVX512-LABEL: @foo5(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT_4:%.*]], [[FOR_INC_4:%.*]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER:%.*]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; AVX512-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], 100
+; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7]], align 4
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4
+; AVX512-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[TMP1]], 100
+; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]]
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+; AVX512: if.then.1:
+; AVX512-NEXT: [[ARRAYIDX7_1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT]]
+; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_1]], align 4
+; AVX512-NEXT: br label [[FOR_INC_1]]
+; AVX512: for.inc.1:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
+; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4
+; AVX512-NEXT: [[CMP1_2:%.*]] = icmp slt i32 [[TMP2]], 100
+; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]]
+; AVX512: if.then.2:
+; AVX512-NEXT: [[ARRAYIDX7_2:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_1]]
+; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_2]], align 4
+; AVX512-NEXT: br label [[FOR_INC_2]]
+; AVX512: for.inc.2:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 3
+; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4
+; AVX512-NEXT: [[CMP1_3:%.*]] = icmp slt i32 [[TMP3]], 100
+; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3:%.*]]
+; AVX512: if.then.3:
+; AVX512-NEXT: [[ARRAYIDX7_3:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV_NEXT_2]]
+; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_3]], align 4
+; AVX512-NEXT: br label [[FOR_INC_3]]
+; AVX512: for.inc.3:
+; AVX512-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
+; AVX512-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_3]]
+; AVX512-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4
+; AVX512-NEXT: [[CMP1_4:%.*]] = icmp slt i32 [[TMP4]], 100
+; AVX512-NEXT: br i1 [[CMP1_4]], label [[IF_THEN_4:%.*]], label [[FOR_INC_4]]
+; AVX512: if.then.4:
+; AVX512-NEXT: [[ARRAYIDX7_4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 
[[INDVARS_IV_NEXT_3]] +; AVX512-NEXT: store i32 sdiv (i32 1, i32 zext (i1 icmp eq (i32** getelementptr inbounds ([1 x i32*], [1 x i32*]* @a, i64 1, i64 0), i32** @c) to i32)), i32* [[ARRAYIDX7_4]], align 4 +; AVX512-NEXT: br label [[FOR_INC_4]] +; AVX512: for.inc.4: +; AVX512-NEXT: [[INDVARS_IV_NEXT_4]] = add nuw nsw i64 [[INDVARS_IV]], 5 +; AVX512-NEXT: [[EXITCOND_4:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_4]], 10000 +; AVX512-NEXT: br i1 [[EXITCOND_4]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; entry: %A.addr = alloca i32*, align 8 %B.addr = alloca i32*, align 8 @@ -501,24 +2404,415 @@ for.end: ; preds = %for.cond ; } ; } ;} -;AVX2-LABEL: @foo6 -;AVX2: icmp sgt <4 x i32> %reverse, zeroinitializer -;AVX2: shufflevector <4 x i1>{{.*}}<4 x i32> -;AVX2: call <4 x double> @llvm.masked.load.v4f64.p0v4f64 -;AVX2: fadd <4 x double> -;AVX2: call void @llvm.masked.store.v4f64.p0v4f64 -;AVX2: ret void - -;AVX512-LABEL: @foo6 -;AVX512: icmp sgt <8 x i32> %reverse, zeroinitializer -;AVX512: shufflevector <8 x i1>{{.*}}<8 x i32> -;AVX512: call <8 x double> @llvm.masked.load.v8f64.p0v8f64 -;AVX512: fadd <8 x double> -;AVX512: call void @llvm.masked.store.v8f64.p0v8f64 -;AVX512: ret void define void @foo6(double* %in, double* %out, i32 %size, i32* %trigger) { +; AVX1-LABEL: @foo6( +; AVX1-NEXT: entry: +; AVX1-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096 +; AVX1-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096 +; AVX1-NEXT: [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096 +; AVX1-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double* +; AVX1-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]] +; AVX1-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32* +; AVX1-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX1-NEXT: [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]] +; AVX1-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]] +; AVX1-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]] +; AVX1-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]] +; AVX1-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] +; AVX1: vector.body: +; AVX1-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX1-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] +; AVX1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]] +; AVX1-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -3 +; AVX1-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41 +; AVX1-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -4 +; AVX1-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i64 -3 +; AVX1-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41 +; AVX1-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -8 +; AVX1-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP8]], i64 -3 +; AVX1-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align
4, !alias.scope !41 +; AVX1-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -12 +; AVX1-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i64 -3 +; AVX1-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* +; AVX1-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 +; AVX1-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer +; AVX1-NEXT: [[TMP15:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer +; AVX1-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer +; AVX1-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer +; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] +; AVX1-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP18]], i64 -3 +; AVX1-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> +; AVX1-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[TMP18]], i64 -4 +; AVX1-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP21]], i64 -3 +; AVX1-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> +; AVX1-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP18]], i64 -8 +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP24]], i64 -3 +; AVX1-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> +; AVX1-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP18]], i64 -12 +; AVX1-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP27]], i64 -3 +; AVX1-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> +; AVX1-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* +; AVX1-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44 +; AVX1-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], +; AVX1-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], +; AVX1-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], +; AVX1-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], +; AVX1-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] +; AVX1-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i64 -3 +; AVX1-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 
8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP34]], i64 -4 +; AVX1-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i64 -3 +; AVX1-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[TMP34]], i64 -8 +; AVX1-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[TMP40]], i64 -3 +; AVX1-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP34]], i64 -12 +; AVX1-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP43]], i64 -3 +; AVX1-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* +; AVX1-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48 +; AVX1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; AVX1-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX1-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX1: for.body: +; AVX1-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_INC_1:%.*]] ], [ 4095, [[ENTRY]] ] +; AVX1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX1-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0 +; AVX1-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX1: if.then: +; AVX1-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] +; AVX1-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX1-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01 +; AVX1-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] +; AVX1-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 +; AVX1-NEXT: br label [[FOR_INC]] +; AVX1: for.inc: +; AVX1-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1 +; AVX1-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX1-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0 +; AVX1-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1]] +; AVX1: for.end: +; AVX1-NEXT: ret void +; AVX1: if.then.1: +; AVX1-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX1-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01 +; AVX1-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]] +; AVX1-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8 +; AVX1-NEXT: br label [[FOR_INC_1]] +; AVX1: for.inc.1: +; AVX1-NEXT: [[INDVARS_IV_NEXT_1]] = add nsw i64 [[INDVARS_IV]], -2 +; AVX1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 0 +; AVX1-NEXT: br i1 [[CMP_1]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50 +; +; AVX2-LABEL: @foo6( +; AVX2-NEXT: 
entry: +; AVX2-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096 +; AVX2-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096 +; AVX2-NEXT: [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096 +; AVX2-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double* +; AVX2-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]] +; AVX2-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32* +; AVX2-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX2-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX2-NEXT: [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]] +; AVX2-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]] +; AVX2-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]] +; AVX2-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]] +; AVX2-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] +; AVX2: vector.body: +; AVX2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX2-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] +; AVX2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]] +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -3 +; AVX2-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4, !alias.scope !41 +; AVX2-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -4 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i64 -3 +; AVX2-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4, !alias.scope !41 +; AVX2-NEXT: [[REVERSE21:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD20]], <4 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -8 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP8]], i64 -3 +; AVX2-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4, !alias.scope !41 +; AVX2-NEXT: [[REVERSE23:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD22]], <4 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -12 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i64 -3 +; AVX2-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>* +; AVX2-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x i32>, <4 x i32>* [[TMP13]], align 4, !alias.scope !41 +; AVX2-NEXT: [[REVERSE25:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD24]], <4 x i32> undef, <4 x i32> +; AVX2-NEXT: [[TMP14:%.*]] = icmp sgt <4 x i32> [[REVERSE]], zeroinitializer +; AVX2-NEXT: [[TMP15:%.*]] = icmp sgt <4 x i32> [[REVERSE21]], zeroinitializer +; AVX2-NEXT: [[TMP16:%.*]] = icmp sgt <4 x i32> [[REVERSE23]], zeroinitializer +; AVX2-NEXT: [[TMP17:%.*]] = icmp sgt <4 x i32> [[REVERSE25]], zeroinitializer +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP18]], i64 -3 +; AVX2-NEXT: [[REVERSE26:%.*]] = shufflevector <4 x i1> [[TMP14]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> 
@llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP20]], i32 8, <4 x i1> [[REVERSE26]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[TMP18]], i64 -4 +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP21]], i64 -3 +; AVX2-NEXT: [[REVERSE28:%.*]] = shufflevector <4 x i1> [[TMP15]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP23]], i32 8, <4 x i1> [[REVERSE28]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP18]], i64 -8 +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP24]], i64 -3 +; AVX2-NEXT: [[REVERSE31:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP26]], i32 8, <4 x i1> [[REVERSE31]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP18]], i64 -12 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP27]], i64 -3 +; AVX2-NEXT: [[REVERSE34:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> undef, <4 x i32> +; AVX2-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <4 x double>* +; AVX2-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* [[TMP29]], i32 8, <4 x i1> [[REVERSE34]], <4 x double> undef), !alias.scope !44 +; AVX2-NEXT: [[TMP30:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], +; AVX2-NEXT: [[TMP31:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD29]], +; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD32]], +; AVX2-NEXT: [[TMP33:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD35]], +; AVX2-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] +; AVX2-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i64 -3 +; AVX2-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP30]], <4 x double>* [[TMP36]], i32 8, <4 x i1> [[REVERSE26]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP34]], i64 -4 +; AVX2-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i64 -3 +; AVX2-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP31]], <4 x double>* [[TMP39]], i32 8, <4 x i1> [[REVERSE28]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[TMP34]], i64 -8 +; AVX2-NEXT: [[TMP41:%.*]] = getelementptr double, double* [[TMP40]], i64 -3 +; AVX2-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP32]], <4 x double>* [[TMP42]], i32 8, <4 x i1> [[REVERSE31]]), !alias.scope !46, !noalias !48 +; AVX2-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP34]], i64 -12 +; AVX2-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP43]], i64 -3 +; AVX2-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <4 x double>* +; AVX2-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP33]], <4 x double>* [[TMP45]], i32 8, <4 x i1> [[REVERSE34]]), !alias.scope !46, !noalias !48 
+; AVX2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; AVX2-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX2-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !49 +; AVX2: for.body: +; AVX2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY]] ] +; AVX2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX2-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0 +; AVX2-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX2: if.then: +; AVX2-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] +; AVX2-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX2-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01 +; AVX2-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] +; AVX2-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 +; AVX2-NEXT: br label [[FOR_INC]] +; AVX2: for.inc: +; AVX2-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1 +; AVX2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX2-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0 +; AVX2-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX2: for.end: +; AVX2-NEXT: ret void +; AVX2: if.then.1: +; AVX2-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX2-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01 +; AVX2-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]] +; AVX2-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8 +; AVX2-NEXT: br label [[FOR_INC_1]] +; AVX2: for.inc.1: +; AVX2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], -2 +; AVX2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX2-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP51]], 0 +; AVX2-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX2: if.then.2: +; AVX2-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: [[TMP52:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 +; AVX2-NEXT: [[ADD_2:%.*]] = fadd double [[TMP52]], 5.000000e-01 +; AVX2-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_1]] +; AVX2-NEXT: store double [[ADD_2]], double* [[ARRAYIDX5_2]], align 8 +; AVX2-NEXT: br label [[FOR_INC_2]] +; AVX2: for.inc.2: +; AVX2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], -3 +; AVX2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX2-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP53]], 0 +; AVX2-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX2: if.then.3: +; AVX2-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 +; AVX2-NEXT: [[ADD_3:%.*]] = 
fadd double [[TMP54]], 5.000000e-01 +; AVX2-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_2]] +; AVX2-NEXT: store double [[ADD_3]], double* [[ARRAYIDX5_3]], align 8 +; AVX2-NEXT: br label [[FOR_INC_3]] +; AVX2: for.inc.3: +; AVX2-NEXT: [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], -4 +; AVX2-NEXT: [[CMP_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2]], 0 +; AVX2-NEXT: br i1 [[CMP_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !50 +; +; AVX512-LABEL: @foo6( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[SCEVGEP:%.*]] = getelementptr double, double* [[OUT:%.*]], i64 4096 +; AVX512-NEXT: [[SCEVGEP9:%.*]] = getelementptr i32, i32* [[TRIGGER:%.*]], i64 4096 +; AVX512-NEXT: [[SCEVGEP12:%.*]] = getelementptr double, double* [[IN:%.*]], i64 4096 +; AVX512-NEXT: [[TMP0:%.*]] = bitcast i32* [[SCEVGEP9]] to double* +; AVX512-NEXT: [[BOUND0:%.*]] = icmp ugt double* [[TMP0]], [[OUT]] +; AVX512-NEXT: [[TMP1:%.*]] = bitcast double* [[SCEVGEP]] to i32* +; AVX512-NEXT: [[BOUND1:%.*]] = icmp ugt i32* [[TMP1]], [[TRIGGER]] +; AVX512-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; AVX512-NEXT: [[BOUND014:%.*]] = icmp ugt double* [[SCEVGEP12]], [[OUT]] +; AVX512-NEXT: [[BOUND115:%.*]] = icmp ugt double* [[SCEVGEP]], [[IN]] +; AVX512-NEXT: [[FOUND_CONFLICT16:%.*]] = and i1 [[BOUND014]], [[BOUND115]] +; AVX512-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT16]] +; AVX512-NEXT: br i1 [[CONFLICT_RDX]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; AVX512-NEXT: [[OFFSET_IDX:%.*]] = sub i64 4095, [[INDEX]] +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[OFFSET_IDX]] +; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -7 +; AVX512-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP4]], align 4, !alias.scope !53 +; AVX512-NEXT: [[REVERSE:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD]], <8 x i32> undef, <8 x i32> +; AVX512-NEXT: [[TMP5:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -8 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i64 -7 +; AVX512-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD20:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4, !alias.scope !53 +; AVX512-NEXT: [[REVERSE21:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD20]], <8 x i32> undef, <8 x i32> +; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -16 +; AVX512-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[TMP8]], i64 -7 +; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD22:%.*]] = load <8 x i32>, <8 x i32>* [[TMP10]], align 4, !alias.scope !53 +; AVX512-NEXT: [[REVERSE23:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD22]], <8 x i32> undef, <8 x i32> +; AVX512-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[TMP2]], i64 -24 +; AVX512-NEXT: [[TMP12:%.*]] = getelementptr i32, i32* [[TMP11]], i64 -7 +; AVX512-NEXT: [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>* +; AVX512-NEXT: [[WIDE_LOAD24:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4, !alias.scope !53 +; AVX512-NEXT: [[REVERSE25:%.*]] = shufflevector <8 x i32> [[WIDE_LOAD24]], <8 x i32> undef, <8 x i32> +; AVX512-NEXT: [[TMP14:%.*]] = icmp sgt <8 x i32> [[REVERSE]], zeroinitializer +; AVX512-NEXT: [[TMP15:%.*]] = icmp sgt <8 x 
i32> [[REVERSE21]], zeroinitializer +; AVX512-NEXT: [[TMP16:%.*]] = icmp sgt <8 x i32> [[REVERSE23]], zeroinitializer +; AVX512-NEXT: [[TMP17:%.*]] = icmp sgt <8 x i32> [[REVERSE25]], zeroinitializer +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[OFFSET_IDX]] +; AVX512-NEXT: [[TMP19:%.*]] = getelementptr double, double* [[TMP18]], i64 -7 +; AVX512-NEXT: [[REVERSE26:%.*]] = shufflevector <8 x i1> [[TMP14]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[TMP20:%.*]] = bitcast double* [[TMP19]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP20]], i32 8, <8 x i1> [[REVERSE26]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[TMP21:%.*]] = getelementptr double, double* [[TMP18]], i64 -8 +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double, double* [[TMP21]], i64 -7 +; AVX512-NEXT: [[REVERSE28:%.*]] = shufflevector <8 x i1> [[TMP15]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP22]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD29:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP23]], i32 8, <8 x i1> [[REVERSE28]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[TMP24:%.*]] = getelementptr double, double* [[TMP18]], i64 -16 +; AVX512-NEXT: [[TMP25:%.*]] = getelementptr double, double* [[TMP24]], i64 -7 +; AVX512-NEXT: [[REVERSE31:%.*]] = shufflevector <8 x i1> [[TMP16]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[TMP26:%.*]] = bitcast double* [[TMP25]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD32:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP26]], i32 8, <8 x i1> [[REVERSE31]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[TMP27:%.*]] = getelementptr double, double* [[TMP18]], i64 -24 +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr double, double* [[TMP27]], i64 -7 +; AVX512-NEXT: [[REVERSE34:%.*]] = shufflevector <8 x i1> [[TMP17]], <8 x i1> undef, <8 x i32> +; AVX512-NEXT: [[TMP29:%.*]] = bitcast double* [[TMP28]] to <8 x double>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD35:%.*]] = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* [[TMP29]], i32 8, <8 x i1> [[REVERSE34]], <8 x double> undef), !alias.scope !56 +; AVX512-NEXT: [[TMP30:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD]], +; AVX512-NEXT: [[TMP31:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD29]], +; AVX512-NEXT: [[TMP32:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD32]], +; AVX512-NEXT: [[TMP33:%.*]] = fadd <8 x double> [[WIDE_MASKED_LOAD35]], +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[OFFSET_IDX]] +; AVX512-NEXT: [[TMP35:%.*]] = getelementptr double, double* [[TMP34]], i64 -7 +; AVX512-NEXT: [[TMP36:%.*]] = bitcast double* [[TMP35]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP30]], <8 x double>* [[TMP36]], i32 8, <8 x i1> [[REVERSE26]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: [[TMP37:%.*]] = getelementptr double, double* [[TMP34]], i64 -8 +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP37]], i64 -7 +; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP31]], <8 x double>* [[TMP39]], i32 8, <8 x i1> [[REVERSE28]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: [[TMP40:%.*]] = getelementptr double, double* [[TMP34]], i64 -16 +; AVX512-NEXT: [[TMP41:%.*]] = getelementptr 
double, double* [[TMP40]], i64 -7 +; AVX512-NEXT: [[TMP42:%.*]] = bitcast double* [[TMP41]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP32]], <8 x double>* [[TMP42]], i32 8, <8 x i1> [[REVERSE31]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: [[TMP43:%.*]] = getelementptr double, double* [[TMP34]], i64 -24 +; AVX512-NEXT: [[TMP44:%.*]] = getelementptr double, double* [[TMP43]], i64 -7 +; AVX512-NEXT: [[TMP45:%.*]] = bitcast double* [[TMP44]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> [[TMP33]], <8 x double>* [[TMP45]], i32 8, <8 x i1> [[REVERSE34]]), !alias.scope !58, !noalias !60 +; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; AVX512-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4096 +; AVX512-NEXT: br i1 [[TMP46]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop !61 +; AVX512: for.body: +; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_INC_3:%.*]] ], [ 4095, [[ENTRY]] ] +; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP47:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; AVX512-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP47]], 0 +; AVX512-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; AVX512: if.then: +; AVX512-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV]] +; AVX512-NEXT: [[TMP48:%.*]] = load double, double* [[ARRAYIDX3]], align 8 +; AVX512-NEXT: [[ADD:%.*]] = fadd double [[TMP48]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] +; AVX512-NEXT: store double [[ADD]], double* [[ARRAYIDX5]], align 8 +; AVX512-NEXT: br label [[FOR_INC]] +; AVX512: for.inc: +; AVX512-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nsw i64 [[INDVARS_IV]], -1 +; AVX512-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP49:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; AVX512-NEXT: [[CMP1_1:%.*]] = icmp sgt i32 [[TMP49]], 0 +; AVX512-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[FOR_INC_1:%.*]] +; AVX512: for.end: +; AVX512-NEXT: ret void +; AVX512: if.then.1: +; AVX512-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: [[TMP50:%.*]] = load double, double* [[ARRAYIDX3_1]], align 8 +; AVX512-NEXT: [[ADD_1:%.*]] = fadd double [[TMP50]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT]] +; AVX512-NEXT: store double [[ADD_1]], double* [[ARRAYIDX5_1]], align 8 +; AVX512-NEXT: br label [[FOR_INC_1]] +; AVX512: for.inc.1: +; AVX512-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nsw i64 [[INDVARS_IV]], -2 +; AVX512-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP51:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; AVX512-NEXT: [[CMP1_2:%.*]] = icmp sgt i32 [[TMP51]], 0 +; AVX512-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[FOR_INC_2:%.*]] +; AVX512: if.then.2: +; AVX512-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: [[TMP52:%.*]] = load double, double* [[ARRAYIDX3_2]], align 8 +; AVX512-NEXT: [[ADD_2:%.*]] = fadd double [[TMP52]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr inbounds double, double* [[OUT]], 
i64 [[INDVARS_IV_NEXT_1]] +; AVX512-NEXT: store double [[ADD_2]], double* [[ARRAYIDX5_2]], align 8 +; AVX512-NEXT: br label [[FOR_INC_2]] +; AVX512: for.inc.2: +; AVX512-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nsw i64 [[INDVARS_IV]], -3 +; AVX512-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[TRIGGER]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP53:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; AVX512-NEXT: [[CMP1_3:%.*]] = icmp sgt i32 [[TMP53]], 0 +; AVX512-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[FOR_INC_3]] +; AVX512: if.then.3: +; AVX512-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr inbounds double, double* [[IN]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: [[TMP54:%.*]] = load double, double* [[ARRAYIDX3_3]], align 8 +; AVX512-NEXT: [[ADD_3:%.*]] = fadd double [[TMP54]], 5.000000e-01 +; AVX512-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV_NEXT_2]] +; AVX512-NEXT: store double [[ADD_3]], double* [[ARRAYIDX5_3]], align 8 +; AVX512-NEXT: br label [[FOR_INC_3]] +; AVX512: for.inc.3: +; AVX512-NEXT: [[INDVARS_IV_NEXT_3]] = add nsw i64 [[INDVARS_IV]], -4 +; AVX512-NEXT: [[CMP_3:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT_2]], 0 +; AVX512-NEXT: br i1 [[CMP_3]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !62 +; entry: %in.addr = alloca double*, align 8 %out.addr = alloca double*, align 8 @@ -581,12 +2875,201 @@ for.end: ; preds = %for.cond ; out[i] = (double) 0.5; ; } -;AVX512-LABEL: @foo7 -;AVX512: call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* -;AVX512: call void @llvm.masked.store.v8f64.p0v8f64 -;AVX512: ret void - define void @foo7(double* noalias %out, double** noalias %in, i8* noalias %trigger, i32 %size) #0 { +; AVX-LABEL: @foo7( +; AVX-NEXT: entry: +; AVX-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 +; AVX-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; AVX: for.body.preheader: +; AVX-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 16 +; AVX-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]] +; AVX: vector.ph: +; AVX-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280 +; AVX-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX: vector.body: +; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1 +; AVX-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 4 +; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP0]], i64 8 +; AVX-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 12 +; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>* +; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1 +; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]], +; AVX-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]], +; AVX-NEXT: [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]], +; AVX-NEXT: [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]], +; AVX-NEXT: [[TMP12:%.*]] = icmp ne <4 x 
i8> [[TMP8]], zeroinitializer +; AVX-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer +; AVX-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer +; AVX-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x double*> undef) +; AVX-NEXT: [[TMP18:%.*]] = getelementptr double*, double** [[TMP16]], i64 4 +; AVX-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x double*> undef) +; AVX-NEXT: [[TMP20:%.*]] = getelementptr double*, double** [[TMP16]], i64 8 +; AVX-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x double*> undef) +; AVX-NEXT: [[TMP22:%.*]] = getelementptr double*, double** [[TMP16]], i64 12 +; AVX-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <4 x double*>* +; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double*> @llvm.masked.load.v4p0f64.p0v4p0f64(<4 x double*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x double*> undef) +; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX-NEXT: [[TMP25:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX-NEXT: [[TMP26:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX-NEXT: [[TMP27:%.*]] = icmp ne <4 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX-NEXT: [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]] +; AVX-NEXT: [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]] +; AVX-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]] +; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]] +; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]]) +; AVX-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP28]], i64 4 +; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]]) +; AVX-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP28]], i64 8 +; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]]) +; AVX-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP28]], i64 12 +; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>* +; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]]) +; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !51 +; AVX: middle.block: +; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 
[[N_VEC]], [[WIDE_TRIP_COUNT]] +; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]] +; AVX: for.body.preheader16: +; AVX-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; AVX-NEXT: br label [[FOR_BODY:%.*]] +; AVX: for.body: +; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ] +; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; AVX-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1 +; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0 +; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] +; AVX: land.lhs.true: +; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]] +; AVX-NEXT: [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8 +; AVX-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP43]], null +; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; AVX: if.then: +; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]] +; AVX-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8 +; AVX-NEXT: br label [[FOR_INC]] +; AVX: for.inc: +; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !52 +; AVX: for.end: +; AVX-NEXT: ret void +; +; AVX512-LABEL: @foo7( +; AVX512-NEXT: entry: +; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0 +; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; AVX512: for.body.preheader: +; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64 +; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 32 +; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]] +; AVX512: vector.ph: +; AVX512-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967264 +; AVX512-NEXT: br label [[VECTOR_BODY:%.*]] +; AVX512: vector.body: +; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* +; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1 +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 8 +; AVX512-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>* +; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 +; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP0]], i64 16 +; AVX512-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>* +; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1 +; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 24 +; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>* +; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1 +; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]], +; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]], +; AVX512-NEXT: [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]], +; AVX512-NEXT: [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]], +; AVX512-NEXT: [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], 
zeroinitializer +; AVX512-NEXT: [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer +; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer +; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer +; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds double*, double** [[IN:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP17:%.*]] = bitcast double** [[TMP16]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x double*> undef) +; AVX512-NEXT: [[TMP18:%.*]] = getelementptr double*, double** [[TMP16]], i64 8 +; AVX512-NEXT: [[TMP19:%.*]] = bitcast double** [[TMP18]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x double*> undef) +; AVX512-NEXT: [[TMP20:%.*]] = getelementptr double*, double** [[TMP16]], i64 16 +; AVX512-NEXT: [[TMP21:%.*]] = bitcast double** [[TMP20]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x double*> undef) +; AVX512-NEXT: [[TMP22:%.*]] = getelementptr double*, double** [[TMP16]], i64 24 +; AVX512-NEXT: [[TMP23:%.*]] = bitcast double** [[TMP22]] to <8 x double*>* +; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x double*> @llvm.masked.load.v8p0f64.p0v8p0f64(<8 x double*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x double*> undef) +; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD]], zeroinitializer +; AVX512-NEXT: [[TMP25:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD13]], zeroinitializer +; AVX512-NEXT: [[TMP26:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD14]], zeroinitializer +; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x double*> [[WIDE_MASKED_LOAD15]], zeroinitializer +; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]] +; AVX512-NEXT: [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]] +; AVX512-NEXT: [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]] +; AVX512-NEXT: [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]] +; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]] +; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]]) +; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP28]], i64 8 +; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]]) +; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP28]], i64 16 +; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]]) +; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP28]], i64 24 +; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>* +; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]]) +; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], 
label [[VECTOR_BODY]], !llvm.loop !63
+; AVX512: middle.block:
+; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
+; AVX512: for.body.preheader16:
+; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
+; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
+; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
+; AVX512: land.lhs.true:
+; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double*, double** [[IN]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP43:%.*]] = load double*, double** [[ARRAYIDX2]], align 8
+; AVX512-NEXT: [[CMP3:%.*]] = icmp eq double* [[TMP43]], null
+; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !64
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+;
 entry:
   %out.addr = alloca double*, align 8
   %in.addr = alloca double**, align 8
@@ -653,12 +3136,201 @@ for.end: ; preds = %for.cond
 ; out[i] = (double) 0.5;
 ;}
-;AVX512-LABEL: @foo8
-;AVX512: call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* %
-;AVX512: call void @llvm.masked.store.v8f64.p0v8f64
-;AVX512: ret void
-
 define void @foo8(double* noalias %out, i32 ()** noalias %in, i8* noalias %trigger, i32 %size) #0 {
+; AVX-LABEL: @foo8(
+; AVX-NEXT: entry:
+; AVX-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
+; AVX-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; AVX: for.body.preheader:
+; AVX-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
+; AVX-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 16
+; AVX-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]]
+; AVX: vector.ph:
+; AVX-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967280
+; AVX-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX: vector.body:
+; AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]]
+; AVX-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <4 x i8>*
+; AVX-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP1]], align 1
+; AVX-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 4
+; AVX-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <4 x i8>*
+; AVX-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
+; AVX-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP0]], i64 8
+; AVX-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <4 x i8>*
+; AVX-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x i8>, <4 x i8>* [[TMP5]], align 1
+; AVX-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 12
+; AVX-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <4 x i8>*
+; AVX-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x i8>, <4 x i8>* [[TMP7]], align 1
+; AVX-NEXT: [[TMP8:%.*]] = and <4 x i8> [[WIDE_LOAD]],
+; AVX-NEXT: [[TMP9:%.*]] = and <4 x i8> [[WIDE_LOAD10]],
+; AVX-NEXT: [[TMP10:%.*]] = and <4 x i8> [[WIDE_LOAD11]],
+; AVX-NEXT: [[TMP11:%.*]] = and <4 x i8> [[WIDE_LOAD12]],
+; AVX-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP8]], zeroinitializer
+; AVX-NEXT: [[TMP13:%.*]] = icmp ne <4 x i8> [[TMP9]], zeroinitializer
+; AVX-NEXT: [[TMP14:%.*]] = icmp ne <4 x i8> [[TMP10]], zeroinitializer
+; AVX-NEXT: [[TMP15:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer
+; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
+; AVX-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <4 x i32 ()*>*
+; AVX-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP17]], i32 8, <4 x i1> [[TMP12]], <4 x i32 ()*> undef)
+; AVX-NEXT: [[TMP18:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 4
+; AVX-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <4 x i32 ()*>*
+; AVX-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP19]], i32 8, <4 x i1> [[TMP13]], <4 x i32 ()*> undef)
+; AVX-NEXT: [[TMP20:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 8
+; AVX-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <4 x i32 ()*>*
+; AVX-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP21]], i32 8, <4 x i1> [[TMP14]], <4 x i32 ()*> undef)
+; AVX-NEXT: [[TMP22:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 12
+; AVX-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <4 x i32 ()*>*
+; AVX-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x i32 ()*> @llvm.masked.load.v4p0f_i32f.p0v4p0f_i32f(<4 x i32 ()*>* [[TMP23]], i32 8, <4 x i1> [[TMP15]], <4 x i32 ()*> undef)
+; AVX-NEXT: [[TMP24:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX-NEXT: [[TMP25:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX-NEXT: [[TMP26:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX-NEXT: [[TMP27:%.*]] = icmp ne <4 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX-NEXT: [[TMP29:%.*]] = and <4 x i1> [[TMP24]], [[TMP12]]
+; AVX-NEXT: [[TMP30:%.*]] = and <4 x i1> [[TMP25]], [[TMP13]]
+; AVX-NEXT: [[TMP31:%.*]] = and <4 x i1> [[TMP26]], [[TMP14]]
+; AVX-NEXT: [[TMP32:%.*]] = and <4 x i1> [[TMP27]], [[TMP15]]
+; AVX-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <4 x double>*
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP33]], i32 8, <4 x i1> [[TMP29]])
+; AVX-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP28]], i64 4
+; AVX-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <4 x double>*
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP35]], i32 8, <4 x i1> [[TMP30]])
+; AVX-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP28]], i64 8
+; AVX-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <4 x double>*
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP37]], i32 8, <4 x i1> [[TMP31]])
+; AVX-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP28]], i64 12
+; AVX-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <4 x double>*
+; AVX-NEXT: call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> , <4 x double>* [[TMP39]], i32 8, <4 x i1> [[TMP32]])
+; AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; AVX-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !54
+; AVX: middle.block:
+; AVX-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; AVX-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
+; AVX: for.body.preheader16:
+; AVX-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; AVX-NEXT: br label [[FOR_BODY:%.*]]
+; AVX: for.body:
+; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
+; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
+; AVX-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
+; AVX-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
+; AVX: land.lhs.true:
+; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
+; AVX-NEXT: [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
+; AVX-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null
+; AVX-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; AVX: if.then:
+; AVX-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX-NEXT: br label [[FOR_INC]]
+; AVX: for.inc:
+; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !55
+; AVX: for.end:
+; AVX-NEXT: ret void
+;
+; AVX512-LABEL: @foo8(
+; AVX512-NEXT: entry:
+; AVX512-NEXT: [[CMP5:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
+; AVX512-NEXT: br i1 [[CMP5]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; AVX512: for.body.preheader:
+; AVX512-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[SIZE]] to i64
+; AVX512-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE]], 32
+; AVX512-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER16:%.*]], label [[VECTOR_PH:%.*]]
+; AVX512: vector.ph:
+; AVX512-NEXT: [[N_VEC:%.*]] = and i64 [[WIDE_TRIP_COUNT]], 4294967264
+; AVX512-NEXT: br label [[VECTOR_BODY:%.*]]
+; AVX512: vector.body:
+; AVX512-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; AVX512-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER:%.*]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>*
+; AVX512-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, <8 x i8>* [[TMP1]], align 1
+; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[TMP0]], i64 8
+; AVX512-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <8 x i8>*
+; AVX512-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
+; AVX512-NEXT: [[TMP4:%.*]] = getelementptr i8, i8* [[TMP0]], i64 16
+; AVX512-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP4]] to <8 x i8>*
+; AVX512-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i8>, <8 x i8>* [[TMP5]], align 1
+; AVX512-NEXT: [[TMP6:%.*]] = getelementptr i8, i8* [[TMP0]], i64 24
+; AVX512-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to <8 x i8>*
+; AVX512-NEXT: [[WIDE_LOAD12:%.*]] = load <8 x i8>, <8 x i8>* [[TMP7]], align 1
+; AVX512-NEXT: [[TMP8:%.*]] = and <8 x i8> [[WIDE_LOAD]],
+; AVX512-NEXT: [[TMP9:%.*]] = and <8 x i8> [[WIDE_LOAD10]],
+; AVX512-NEXT: [[TMP10:%.*]] = and <8 x i8> [[WIDE_LOAD11]],
+; AVX512-NEXT: [[TMP11:%.*]] = and <8 x i8> [[WIDE_LOAD12]],
+; AVX512-NEXT: [[TMP12:%.*]] = icmp ne <8 x i8> [[TMP8]], zeroinitializer
+; AVX512-NEXT: [[TMP13:%.*]] = icmp ne <8 x i8> [[TMP9]], zeroinitializer
+; AVX512-NEXT: [[TMP14:%.*]] = icmp ne <8 x i8> [[TMP10]], zeroinitializer
+; AVX512-NEXT: [[TMP15:%.*]] = icmp ne <8 x i8> [[TMP11]], zeroinitializer
+; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN:%.*]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP17:%.*]] = bitcast i32 ()** [[TMP16]] to <8 x i32 ()*>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP17]], i32 8, <8 x i1> [[TMP12]], <8 x i32 ()*> undef)
+; AVX512-NEXT: [[TMP18:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 8
+; AVX512-NEXT: [[TMP19:%.*]] = bitcast i32 ()** [[TMP18]] to <8 x i32 ()*>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP19]], i32 8, <8 x i1> [[TMP13]], <8 x i32 ()*> undef)
+; AVX512-NEXT: [[TMP20:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 16
+; AVX512-NEXT: [[TMP21:%.*]] = bitcast i32 ()** [[TMP20]] to <8 x i32 ()*>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP21]], i32 8, <8 x i1> [[TMP14]], <8 x i32 ()*> undef)
+; AVX512-NEXT: [[TMP22:%.*]] = getelementptr i32 ()*, i32 ()** [[TMP16]], i64 24
+; AVX512-NEXT: [[TMP23:%.*]] = bitcast i32 ()** [[TMP22]] to <8 x i32 ()*>*
+; AVX512-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <8 x i32 ()*> @llvm.masked.load.v8p0f_i32f.p0v8p0f_i32f(<8 x i32 ()*>* [[TMP23]], i32 8, <8 x i1> [[TMP15]], <8 x i32 ()*> undef)
+; AVX512-NEXT: [[TMP24:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD]], zeroinitializer
+; AVX512-NEXT: [[TMP25:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD13]], zeroinitializer
+; AVX512-NEXT: [[TMP26:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD14]], zeroinitializer
+; AVX512-NEXT: [[TMP27:%.*]] = icmp ne <8 x i32 ()*> [[WIDE_MASKED_LOAD15]], zeroinitializer
+; AVX512-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, double* [[OUT:%.*]], i64 [[INDEX]]
+; AVX512-NEXT: [[TMP29:%.*]] = and <8 x i1> [[TMP24]], [[TMP12]]
+; AVX512-NEXT: [[TMP30:%.*]] = and <8 x i1> [[TMP25]], [[TMP13]]
+; AVX512-NEXT: [[TMP31:%.*]] = and <8 x i1> [[TMP26]], [[TMP14]]
+; AVX512-NEXT: [[TMP32:%.*]] = and <8 x i1> [[TMP27]], [[TMP15]]
+; AVX512-NEXT: [[TMP33:%.*]] = bitcast double* [[TMP28]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP33]], i32 8, <8 x i1> [[TMP29]])
+; AVX512-NEXT: [[TMP34:%.*]] = getelementptr double, double* [[TMP28]], i64 8
+; AVX512-NEXT: [[TMP35:%.*]] = bitcast double* [[TMP34]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP35]], i32 8, <8 x i1> [[TMP30]])
+; AVX512-NEXT: [[TMP36:%.*]] = getelementptr double, double* [[TMP28]], i64 16
+; AVX512-NEXT: [[TMP37:%.*]] = bitcast double* [[TMP36]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP37]], i32 8, <8 x i1> [[TMP31]])
+; AVX512-NEXT: [[TMP38:%.*]] = getelementptr double, double* [[TMP28]], i64 24
+; AVX512-NEXT: [[TMP39:%.*]] = bitcast double* [[TMP38]] to <8 x double>*
+; AVX512-NEXT: call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> , <8 x double>* [[TMP39]], i32 8, <8 x i1> [[TMP32]])
+; AVX512-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32
+; AVX512-NEXT: [[TMP40:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; AVX512-NEXT: br i1 [[TMP40]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !66
+; AVX512: middle.block:
+; AVX512-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
+; AVX512-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER16]]
+; AVX512: for.body.preheader16:
+; AVX512-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; AVX512-NEXT: br label [[FOR_BODY:%.*]]
+; AVX512: for.body:
+; AVX512-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER16]] ]
+; AVX512-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[TRIGGER]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP41:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; AVX512-NEXT: [[TMP42:%.*]] = and i8 [[TMP41]], 1
+; AVX512-NEXT: [[TOBOOL:%.*]] = icmp eq i8 [[TMP42]], 0
+; AVX512-NEXT: br i1 [[TOBOOL]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
+; AVX512: land.lhs.true:
+; AVX512-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32 ()*, i32 ()** [[IN]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: [[TMP43:%.*]] = load i32 ()*, i32 ()** [[ARRAYIDX2]], align 8
+; AVX512-NEXT: [[CMP3:%.*]] = icmp eq i32 ()* [[TMP43]], null
+; AVX512-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[IF_THEN:%.*]]
+; AVX512: if.then:
+; AVX512-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[OUT]], i64 [[INDVARS_IV]]
+; AVX512-NEXT: store double 5.000000e-01, double* [[ARRAYIDX5]], align 8
+; AVX512-NEXT: br label [[FOR_INC]]
+; AVX512: for.inc:
+; AVX512-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; AVX512-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; AVX512-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop !67
+; AVX512: for.end:
+; AVX512-NEXT: ret void
+;
 entry:
   %out.addr = alloca double*, align 8
   %in.addr = alloca i32 ()**, align 8
--
2.7.4