From: Alexey Bataev
Date: Fri, 11 Jan 2019 20:21:14 +0000 (+0000)
Subject: [SLP] Update test checks for the SLP vectorizer, NFC.
X-Git-Tag: llvmorg-8.0.0-rc1~387
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ce2c8b3360c2270fb95a2a98bef2bedeef476415;p=platform%2Fupstream%2Fllvm.git

[SLP] Update test checks for the SLP vectorizer, NFC.

llvm-svn: 350967
---

diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
index edc8042..ad970b2 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s
 ; RUN: opt -S -slp-vectorizer -mtriple=aarch64-apple-ios -mcpu=cyclone < %s | FileCheck %s
 ; Currently disabled for a few subtargets (e.g. Kryo):
@@ -5,14 +6,36 @@
 ; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -slp-min-reg-size=128 < %s | FileCheck --check-prefix=NO_SLP %s
 define void @f(float* %r, float* %w) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT: [[R0:%.*]] = getelementptr inbounds float, float* [[R:%.*]], i64 0
+; CHECK-NEXT: [[R1:%.*]] = getelementptr inbounds float, float* [[R]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[R0]] to <2 x float>*
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], [[TMP2]]
+; CHECK-NEXT: [[W0:%.*]] = getelementptr inbounds float, float* [[W:%.*]], i64 0
+; CHECK-NEXT: [[W1:%.*]] = getelementptr inbounds float, float* [[W]], i64 1
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[W0]] to <2 x float>*
+; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
+; CHECK-NEXT: ret void
+;
+; NO_SLP-LABEL: @f(
+; NO_SLP-NEXT: [[R0:%.*]] = getelementptr inbounds float, float* [[R:%.*]], i64 0
+; NO_SLP-NEXT: [[R1:%.*]] = getelementptr inbounds float, float* [[R]], i64 1
+; NO_SLP-NEXT: [[F0:%.*]] = load float, float* [[R0]]
+; NO_SLP-NEXT: [[F1:%.*]] = load float, float* [[R1]]
+; NO_SLP-NEXT: [[ADD0:%.*]] = fadd float [[F0]], [[F0]]
+; NO_SLP-NEXT: [[ADD1:%.*]] = fadd float [[F1]], [[F1]]
+; NO_SLP-NEXT: [[W0:%.*]] = getelementptr inbounds float, float* [[W:%.*]], i64 0
+; NO_SLP-NEXT: [[W1:%.*]] = getelementptr inbounds float, float* [[W]], i64 1
+; NO_SLP-NEXT: store float [[ADD0]], float* [[W0]]
+; NO_SLP-NEXT: store float [[ADD1]], float* [[W1]]
+; NO_SLP-NEXT: ret void
+;
 %r0 = getelementptr inbounds float, float* %r, i64 0
 %r1 = getelementptr inbounds float, float* %r, i64 1
 %f0 = load float, float* %r0
 %f1 = load float, float* %r1
 %add0 = fadd float %f0, %f0
-; CHECK: fadd <2 x float>
-; NO_SLP: fadd float
-; NO_SLP: fadd float
 %add1 = fadd float %f1, %f1
 %w0 = getelementptr inbounds float, float* %w, i64 0
 %w1 = getelementptr inbounds float, float* %w, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
index 2bce59c..3f831fd 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -slp-vectorizer %s -slp-threshold=-10 | FileCheck %s
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
@@ -5,17 +6,27 @@
target triple = "aarch64--linux-gnu" %structA = type { [2 x float] } define void @test1(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) { -; CHECK-LABEL: test1 -; CHECK: %arrayidx4 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 0 -; CHECK: %arrayidx9 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 1 -; CHECK: %3 = bitcast float* %arrayidx4 to <2 x float>* -; CHECK: %4 = load <2 x float>, <2 x float>* %3, align 4 -; CHECK: %5 = fsub fast <2 x float> %2, %4 -; CHECK: %6 = fmul fast <2 x float> %5, %5 -; CHECK: %7 = extractelement <2 x float> %6, i32 0 -; CHECK: %8 = extractelement <2 x float> %6, i32 1 -; CHECK: %add = fadd fast float %7, %8 -; CHECK: %cmp = fcmp oeq float %add, 0.000000e+00 +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[XMIN:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1 +; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]] +; CHECK: for.body3.lr.ph: +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], %structA* [[J:%.*]], i64 0, i32 0, i64 0 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA]], %structA* [[J]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]] +; CHECK: for.end27: +; CHECK-NEXT: ret void +; entry: br label %for.body3.lr.ph @@ -40,17 +51,27 @@ for.end27: } define void @test2(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) { -; CHECK-LABEL: test2 -; CHECK: %arrayidx4 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 0 -; CHECK: %arrayidx9 = getelementptr inbounds %structA, %structA* %J, i64 0, i32 0, i64 1 -; CHECK: %3 = bitcast float* %arrayidx4 to <2 x float>* -; CHECK: %4 = load <2 x float>, <2 x float>* %3, align 4 -; CHECK: %5 = fsub fast <2 x float> %2, %4 -; CHECK: %6 = fmul fast <2 x float> %5, %5 -; CHECK: %7 = extractelement <2 x float> %6, i32 0 -; CHECK: %8 = extractelement <2 x float> %6, i32 1 -; CHECK: %add = fadd fast float %8, %7 -; CHECK: %cmp = fcmp oeq float %add, 0.000000e+00 +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[XMIN:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1 +; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]] +; CHECK: for.body3.lr.ph: +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float> +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], %structA* [[J:%.*]], i64 0, i32 0, i64 0 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA]], %structA* [[J]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fsub 
fast <2 x float> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP6]], i32 1 +; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]] +; CHECK: for.end27: +; CHECK-NEXT: ret void +; entry: br label %for.body3.lr.ph diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll index a4c655c..401776a 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -S -slp-vectorizer -instcombine -pass-remarks-output=%t | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=REMARK %s ; RUN: opt < %s -S -passes='slp-vectorizer,instcombine' -pass-remarks-output=%t | FileCheck %s @@ -6,25 +7,25 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" -; CHECK-LABEL: @gather_multiple_use( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], -; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP9]]) -; CHECK-NEXT: ret i32 [[TMP10]] -; ; REMARK-LABEL: Function: gather_multiple_use ; REMARK: Args: ; REMARK-NEXT: - String: 'Vectorized horizontal reduction with cost ' ; REMARK-NEXT: - Cost: '-7' ; define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) { +; CHECK-LABEL: @gather_multiple_use( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP5]], +; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <4 x i32> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP9]]) +; CHECK-NEXT: ret i32 [[TMP10]] +; %tmp00 = lshr i32 %a, 15 %tmp01 = and i32 %tmp00, 65537 %tmp02 = mul nuw i32 %tmp01, 65535 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll index 3f61875b..db02f55 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll @@ -1,3 +1,4 @@ +; 
NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s ; RUN: opt -S -passes='slp-vectorizer,dce,instcombine' -slp-threshold=-18 -pass-remarks-output=%t < %s | FileCheck %s @@ -22,12 +23,6 @@ target triple = "aarch64--linux-gnu" ; } ; -; CHECK-LABEL: @getelementptr_4x32 -; -; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <4 x i32> -; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> [[A]] -; CHECK: sext i32 [[X]] to i64 - ; YAML: --- !Passed ; YAML-NEXT: Pass: slp-vectorizer ; YAML-NEXT: Name: VectorizedList @@ -49,6 +44,56 @@ target triple = "aarch64--linux-gnu" ; YAML-NEXT: - TreeSize: '3' define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) { +; CHECK-LABEL: @getelementptr_4x32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[X:%.*]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[Y:%.*]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[Z:%.*]], i32 3 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP21:%.*]], i32 1 +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP3]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] +; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP21]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32> [[TMP4]], i32 0 +; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[T4]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP10]] +; CHECK-NEXT: [[T6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP4]], i32 1 +; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP13]] +; CHECK-NEXT: [[T8:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]] +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP15]] +; CHECK-NEXT: [[T10:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4 +; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], [[T10]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* 
[[G]], i64 [[TMP17]] +; CHECK-NEXT: [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i32> , i32 [[ADD11]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[T12]], i32 1 +; CHECK-NEXT: [[TMP21]] = add nsw <2 x i32> [[TMP18]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i32> [[TMP21]], i32 0 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP22]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] +; entry: %cmp31 = icmp sgt i32 %n, 0 br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup @@ -88,12 +133,6 @@ for.body: br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body } -; CHECK-LABEL: @getelementptr_2x32 -; -; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <2 x i32> -; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[A]] -; CHECK: sext i32 [[X]] to i64 - ; YAML: --- !Passed ; YAML-NEXT: Pass: slp-vectorizer ; YAML-NEXT: Name: VectorizedList @@ -115,6 +154,54 @@ for.body: ; YAML-NEXT: - TreeSize: '3' define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) { +; CHECK-LABEL: @getelementptr_2x32( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP31:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP31]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[Y:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[Z:%.*]], i32 1 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP18:%.*]], i32 1 +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP2]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ] +; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP18]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP3]], i32 0 +; CHECK-NEXT: [[T4:%.*]] = shl nsw i32 [[TMP4]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[T4]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[G:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[T6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP3]], i32 1 +; CHECK-NEXT: [[ADD1:%.*]] = add nsw i32 [[T6]], [[TMP6]] +; CHECK-NEXT: [[T7:%.*]] = or i32 [[T4]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = sext i32 [[T7]] to i64 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP7]] +; CHECK-NEXT: [[T8:%.*]] = load i32, i32* [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[ADD1]], [[T8]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> undef, i32 [[T4]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <2 x i32> [[TMP1]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i32> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP12]] +; CHECK-NEXT: [[T10:%.*]] = load i32, i32* [[ARRAYIDX10]], align 4 +; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD6]], 
[[T10]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[G]], i64 [[TMP14]] +; CHECK-NEXT: [[T12:%.*]] = load i32, i32* [[ARRAYIDX15]], align 4 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x i32> , i32 [[ADD11]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i32> undef, i32 [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i32> [[TMP16]], i32 [[T12]], i32 1 +; CHECK-NEXT: [[TMP18]] = add nsw <2 x i32> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i32> [[TMP18]], i32 0 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP19]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]] +; entry: %cmp31 = icmp sgt i32 %n, 0 br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll index 02cf09d..c4a584f 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll @@ -1,6 +1,5 @@ -; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s -; RUN: cat %t | FileCheck -check-prefix=YAML %s -; RUN: opt -passes=slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -slp-vectorizer -slp-threshold=-6 -S -pass-remarks-output=%t < %s | FileCheck %s ; RUN: cat %t | FileCheck -check-prefix=YAML %s @@ -10,11 +9,6 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux" -; CHECK-LABEL: test_select -; CHECK: load <4 x i32> -; CHECK: load <4 x i32> -; CHECK: select <4 x i1> - ; YAML: --- !Passed ; YAML-NEXT: Pass: slp-vectorizer ; YAML-NEXT: Name: VectorizedHorizontalReduction @@ -26,6 +20,49 @@ target triple = "aarch64--linux" ; YAML-NEXT: - TreeSize: '8' define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h) { +; CHECK-LABEL: @test_select( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_22:%.*]] = icmp sgt i32 [[H:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP_22]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[LX:%.*]] to i64 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[S_026:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[J_025:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[P2_024:%.*]] = phi i32* [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR29:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[P1_023:%.*]] = phi i32* [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 1 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 2 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 2 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 
4 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32> [[TMP4]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, [[S_026]] +; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[ADD]], undef +; CHECK-NEXT: [[ADD19:%.*]] = add nsw i32 [[ADD11]], undef +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP7]]) +; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP8]], [[S_026]] +; CHECK-NEXT: [[ADD27:%.*]] = add nsw i32 [[ADD19]], undef +; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[P1_023]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR29]] = getelementptr inbounds i32, i32* [[P2_024]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[J_025]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[H]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[S_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[S_0_LCSSA]] +; entry: %cmp.22 = icmp sgt i32 %h, 0 br i1 %cmp.22, label %for.body.lr.ph, label %for.end @@ -104,11 +141,6 @@ for.end: ; preds = %for.end.loopexit, % ;; p2 += lx; ;; } define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) { -; CHECK-LABEL: reduction_with_br -; CHECK: load <4 x i32> -; CHECK: load <4 x i32> -; CHECK: mul nsw <4 x i32> - ; YAML: --- !Passed ; YAML-NEXT: Pass: slp-vectorizer ; YAML-NEXT: Name: VectorizedHorizontalReduction @@ -118,7 +150,49 @@ define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalia ; YAML-NEXT: - Cost: '-11' ; YAML-NEXT: - String: ' and with tree size ' ; YAML-NEXT: - TreeSize: '3' - +; CHECK-LABEL: @reduction_with_br( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_16:%.*]] = icmp sgt i32 [[H:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP_16]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[LX:%.*]] to i64 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[S_020:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[J_019:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[IF_END]] ] +; CHECK-NEXT: [[P2_018:%.*]] = phi i32* [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR16:%.*]], [[IF_END]] ] +; CHECK-NEXT: [[P1_017:%.*]] = phi i32* [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[IF_END]] ] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 1 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 2 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 2 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P1_017]] to <4 x i32>* 
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_018]] to <4 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, [[S_020]] +; CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[ADD]], undef +; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[ADD5]], undef +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP5]], [[S_020]] +; CHECK-NEXT: [[ADD13:%.*]] = add nsw i32 [[ADD9]], undef +; CHECK-NEXT: [[CMP14:%.*]] = icmp slt i32 [[OP_EXTRA]], [[LIM:%.*]] +; CHECK-NEXT: br i1 [[CMP14]], label [[IF_END]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: if.end: +; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, i32* [[P1_017]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR16]] = getelementptr inbounds i32, i32* [[P2_018]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[J_019]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[H]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[S_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[S_1]] +; entry: %cmp.16 = icmp sgt i32 %h, 0 br i1 %cmp.16, label %for.body.lr.ph, label %for.end @@ -172,11 +246,6 @@ for.end: ; preds = %for.end.loopexit, % ret i32 %s.1 } -; CHECK: test_unrolled_select -; CHECK: load <8 x i8> -; CHECK: load <8 x i8> -; CHECK: select <8 x i1> - ; YAML: --- !Passed ; YAML-NEXT: Pass: slp-vectorizer ; YAML-NEXT: Name: VectorizedHorizontalReduction @@ -188,6 +257,66 @@ for.end: ; preds = %for.end.loopexit, % ; YAML-NEXT: - TreeSize: '10' define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noalias nocapture readonly %blk2, i32 %lx, i32 %h, i32 %lim) #0 { +; CHECK-LABEL: @test_unrolled_select( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP_43:%.*]] = icmp sgt i32 [[H:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP_43]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[IDX_EXT:%.*]] = sext i32 [[LX:%.*]] to i64 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[S_047:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[OP_EXTRA:%.*]], [[IF_END_86:%.*]] ] +; CHECK-NEXT: [[J_046:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[IF_END_86]] ] +; CHECK-NEXT: [[P2_045:%.*]] = phi i8* [ [[BLK2:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR88:%.*]], [[IF_END_86]] ] +; CHECK-NEXT: [[P1_044:%.*]] = phi i8* [ [[BLK1:%.*]], [[FOR_BODY_LR_PH]] ], [ [[ADD_PTR:%.*]], [[IF_END_86]] ] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 1 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 1 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 2 +; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 2 +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 3 +; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 3 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 4 +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = 
getelementptr inbounds i8, i8* [[P2_045]], i64 4 +; CHECK-NEXT: [[ARRAYIDX50:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 5 +; CHECK-NEXT: [[ARRAYIDX52:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 5 +; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 6 +; CHECK-NEXT: [[ARRAYIDX63:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 6 +; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 7 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1_044]] to <8 x i8>* +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32> +; CHECK-NEXT: [[ARRAYIDX74:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 7 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[P2_045]] to <8 x i8>* +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <8 x i32> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp slt <8 x i32> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = sub nsw <8 x i32> zeroinitializer, [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = select <8 x i1> [[TMP7]], <8 x i32> [[TMP8]], <8 x i32> [[TMP6]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 undef, [[S_047]] +; CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[ADD]], undef +; CHECK-NEXT: [[ADD27:%.*]] = add nsw i32 [[ADD16]], undef +; CHECK-NEXT: [[ADD38:%.*]] = add nsw i32 [[ADD27]], undef +; CHECK-NEXT: [[ADD49:%.*]] = add nsw i32 [[ADD38]], undef +; CHECK-NEXT: [[ADD60:%.*]] = add nsw i32 [[ADD49]], undef +; CHECK-NEXT: [[ADD71:%.*]] = add nsw i32 [[ADD60]], undef +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v8i32(<8 x i32> [[TMP9]]) +; CHECK-NEXT: [[OP_EXTRA]] = add nsw i32 [[TMP10]], [[S_047]] +; CHECK-NEXT: [[ADD82:%.*]] = add nsw i32 [[ADD71]], undef +; CHECK-NEXT: [[CMP83:%.*]] = icmp slt i32 [[OP_EXTRA]], [[LIM:%.*]] +; CHECK-NEXT: br i1 [[CMP83]], label [[IF_END_86]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: if.end.86: +; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, i8* [[P1_044]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[ADD_PTR88]] = getelementptr inbounds i8, i8* [[P2_045]], i64 [[IDX_EXT]] +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[J_046]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[H]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END_LOOPEXIT]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[S_1:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_EXTRA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: ret i32 [[S_1]] +; entry: %cmp.43 = icmp sgt i32 %h, 0 br i1 %cmp.43, label %for.body.lr.ph, label %for.end diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll index 7e1d670..0429665 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/minimum-sizes.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -slp-vectorizer < %s | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" @@ -7,10 +8,19 @@ target triple = "aarch64--linux-gnu" ; should not compute a smaller size for %k.13 since it is in a use-def cycle ; and cannot be demoted. 
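; To make the cycle concrete, these are the two instructions involved, copied
; from the IR below (an illustration only of why no narrower bit width can be
; proven for them):
;   %k.13 = phi i32 [ undef, %entry ], [ %k.3, %for.cond4 ]
;   %k.3 = select i1 undef, i32 %k.13, i32 undef
; Each value's width depends on the other's, so neither can be demoted.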
; -; CHECK-LABEL: @PR26364 -; CHECK: %k.13 = phi i32 -; define fastcc void @PR26364() { +; CHECK-LABEL: @PR26364( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 undef, label [[FOR_END11:%.*]], label [[FOR_COND4:%.*]] +; CHECK: for.cond4: +; CHECK-NEXT: [[K_13:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[K_3:%.*]], [[FOR_COND4]] ] +; CHECK-NEXT: [[E_02:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ 0, [[FOR_COND4]] ] +; CHECK-NEXT: [[E_1:%.*]] = select i1 undef, i32 [[E_02]], i32 0 +; CHECK-NEXT: [[K_3]] = select i1 undef, i32 [[K_13]], i32 undef +; CHECK-NEXT: br label [[FOR_COND4]] +; CHECK: for.end11: +; CHECK-NEXT: ret void +; entry: br i1 undef, label %for.end11, label %for.cond4 @@ -29,10 +39,25 @@ for.end11: ; every root in the vectorizable tree when computing minimum sizes since one ; root may require fewer bits than another. ; -; CHECK-LABEL: @PR26629 -; CHECK-NOT: {{.*}} and <2 x i72> -; define void @PR26629(i32* %c) { +; CHECK-LABEL: @PR26629( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 undef, label [[FOR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.ph: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[C:%.*]], align 4 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[D:%.*]] = phi i72 [ 576507472957710340, [[FOR_PH]] ], [ [[BF_SET17:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[SUB:%.*]] = sub i32 [[TMP0]], undef +; CHECK-NEXT: [[BF_CLEAR13:%.*]] = and i72 [[D]], -576460748008464384 +; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[SUB]] to i72 +; CHECK-NEXT: [[BF_VALUE15:%.*]] = and i72 [[TMP1]], 8191 +; CHECK-NEXT: [[BF_CLEAR16:%.*]] = or i72 [[BF_VALUE15]], [[BF_CLEAR13]] +; CHECK-NEXT: [[BF_SET17]] = or i72 [[BF_CLEAR16]], undef +; CHECK-NEXT: br label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br i1 undef, label %for.ph, label %for.end diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/mismatched-intrinsics.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/mismatched-intrinsics.ll index 3d6da12..64b8743 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/mismatched-intrinsics.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/mismatched-intrinsics.ll @@ -1,11 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -slp-vectorizer %s | FileCheck %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios5.0.0" define i64 @mismatched_intrinsics(<4 x i32> %in1, <2 x i32> %in2) nounwind { -; CHECK-LABEL: @mismatched_intrinsics -; CHECK: call i64 @llvm.arm64.neon.saddlv.i64.v4i32 -; CHECK: call i64 @llvm.arm64.neon.saddlv.i64.v2i32 +; CHECK-LABEL: @mismatched_intrinsics( +; CHECK-NEXT: [[VADDLVQ_S32_I:%.*]] = tail call i64 @llvm.arm64.neon.saddlv.i64.v4i32(<4 x i32> [[IN1:%.*]]) +; CHECK-NEXT: [[VADDLV_S32_I:%.*]] = tail call i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> [[IN2:%.*]]) +; CHECK-NEXT: [[TST:%.*]] = icmp sgt i64 [[VADDLVQ_S32_I]], [[VADDLV_S32_I]] +; CHECK-NEXT: [[EQUAL:%.*]] = sext i1 [[TST]] to i64 +; CHECK-NEXT: ret i64 [[EQUAL]] +; %vaddlvq_s32.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v4i32(<4 x i32> %in1) #2 %vaddlv_s32.i = tail call i64 @llvm.arm64.neon.saddlv.i64.v2i32(<2 x i32> %in2) #2 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll index 87d021d..98c0332 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/nontemporal.ll @@ -1,12 +1,22 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py ; RUN: opt -S -basicaa -slp-vectorizer -dce < %s | FileCheck %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios5.0.0" -; CHECK-LABEL: @foo define void @foo(float* noalias %a, float* noalias %b, float* noalias %c) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, !nontemporal !0 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[C:%.*]] to <4 x float>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[A:%.*]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4, !nontemporal !0 +; CHECK-NEXT: ret void +; entry: ; Check that we don't lose !nontemporal hint when vectorizing loads. -; CHECK: %{{[0-9]*}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0 %b1 = load float, float* %b, align 4, !nontemporal !0 %arrayidx.1 = getelementptr inbounds float, float* %b, i64 1 %b2 = load float, float* %arrayidx.1, align 4, !nontemporal !0 @@ -16,7 +26,6 @@ entry: %b4 = load float, float* %arrayidx.3, align 4, !nontemporal !0 ; Check that we don't introduce !nontemporal hint when the original scalar loads didn't have it. -; CHECK: %{{[0-9]*}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4{{$}} %c1 = load float, float* %c, align 4 %arrayidx2.1 = getelementptr inbounds float, float* %c, i64 1 %c2 = load float, float* %arrayidx2.1, align 4 @@ -31,7 +40,6 @@ entry: %a4 = fadd float %b4, %c4 ; Check that we don't lose !nontemporal hint when vectorizing stores. -; CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 4, !nontemporal !0 store float %a1, float* %a, align 4, !nontemporal !0 %arrayidx3.1 = getelementptr inbounds float, float* %a, i64 1 store float %a2, float* %arrayidx3.1, align 4, !nontemporal !0 @@ -40,16 +48,21 @@ entry: %arrayidx3.3 = getelementptr inbounds float, float* %a, i64 3 store float %a4, float* %arrayidx3.3, align 4, !nontemporal !0 -; CHECK: ret void ret void } -; CHECK-LABEL: @foo2 define void @foo2(float* noalias %a, float* noalias %b) { +; CHECK-LABEL: @foo2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[A:%.*]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; entry: ; Check that we don't mark vector load with !nontemporal attribute if some of ; the original scalar loads don't have it. -; CHECK: %{{[0-9]*}} = load <4 x float>, <4 x float>* %{{[0-9]+}}, align 4{{$}} %b1 = load float, float* %b, align 4, !nontemporal !0 %arrayidx.1 = getelementptr inbounds float, float* %b, i64 1 %b2 = load float, float* %arrayidx.1, align 4 @@ -60,7 +73,6 @@ entry: ; Check that we don't mark vector store with !nontemporal attribute if some of ; the original scalar stores don't have it. 
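; For illustration, the merged store the checks above expect is a plain
; vector store with the hint dropped (a sketch; %v and %p are placeholder
; names, not values from this test):
;   store <4 x float> %v, <4 x float>* %p, align 4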
-; CHECK: store <4 x float> %{{[0-9]+}}, <4 x float>* %{{[0-9]+}}, align 4{{$}}
 store float %b1, float* %a, align 4, !nontemporal !0
 %arrayidx3.1 = getelementptr inbounds float, float* %a, i64 1
 store float %b2, float* %arrayidx3.1, align 4
@@ -69,7 +81,6 @@ entry:
 %arrayidx3.3 = getelementptr inbounds float, float* %a, i64 3
 store float %b4, float* %arrayidx3.3, align 4, !nontemporal !0
-; CHECK: ret void
 ret void
 }
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll
index 72c7082..8311f20 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/sdiv-pow2.ll
@@ -1,13 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu -mcpu=cortex-a57 | FileCheck %s
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
-; CHECK-LABEL: @test1
-; CHECK: load <4 x i32>
-; CHECK: add nsw <4 x i32>
-; CHECK: sdiv <4 x i32>
-
 define void @test1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 2
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 2
+; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 2
+; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 3
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[B]] to <4 x i32>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[C]] to <4 x i32>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP1]]
+; CHECK-NEXT: [[TMP5:%.*]] = sdiv <4 x i32> [[TMP4]],
+; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 3
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[A]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: ret void
+;
 entry:
 %0 = load i32, i32* %b, align 4
 %1 = load i32, i32* %c, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/ARM/memory.ll b/llvm/test/Transforms/SLPVectorizer/ARM/memory.ll
index 57d7cce..70e8703 100644
--- a/llvm/test/Transforms/SLPVectorizer/ARM/memory.ll
+++ b/llvm/test/Transforms/SLPVectorizer/ARM/memory.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=thumbv7-apple-ios3.0.0 -mcpu=swift | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
@@ -5,10 +6,17 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
 ; On swift unaligned <2 x double> stores need 4 uops and it is therefore cheaper
 ; to do this scalar.
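; In rough numbers (the 4-uop figure comes from the comment above; the
; 1-uop-per-scalar-store figure is an assumption for illustration):
;   2 scalar double stores ~ 2 uops  <  1 unaligned <2 x double> store ~ 4 uops
; so the scalar form wins and the stores in this test stay unvectorized.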
-; CHECK-LABEL: expensive_double_store
-; CHECK-NOT: load <2 x double>
-; CHECK-NOT: store <2 x double>
 define void @expensive_double_store(double* noalias %dst, double* noalias %src, i64 %count) {
+; CHECK-LABEL: @expensive_double_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[SRC:%.*]], align 8
+; CHECK-NEXT: store double [[TMP0]], double* [[DST:%.*]], align 8
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[SRC]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX2]], align 8
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[DST]], i64 1
+; CHECK-NEXT: store double [[TMP1]], double* [[ARRAYIDX3]], align 8
+; CHECK-NEXT: ret void
+;
 entry:
 %0 = load double, double* %src, align 8
 store double %0, double* %dst, align 8
diff --git a/llvm/test/Transforms/SLPVectorizer/ARM/sroa.ll b/llvm/test/Transforms/SLPVectorizer/ARM/sroa.ll
index 65e0260..c43e8f1 100644
--- a/llvm/test/Transforms/SLPVectorizer/ARM/sroa.ll
+++ b/llvm/test/Transforms/SLPVectorizer/ARM/sroa.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -mcpu=swift -mtriple=thumbv7-apple-ios -basicaa -slp-vectorizer < %s | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
@@ -8,11 +9,45 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-
 ; because the scalar versions of the shl/or are handled by the
 ; backend and disappear, the vectorized code stays.
-; CHECK-LABEL: SROAed
-; CHECK-NOT: shl nuw <2 x i64>
-; CHECK-NOT: or <2 x i64>
-
 define void @SROAed(%class.Complex* noalias nocapture sret %agg.result, [4 x i32] %a.coerce, [4 x i32] %b.coerce) {
+; CHECK-LABEL: @SROAed(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[A_COERCE:%.*]], 0
+; CHECK-NEXT: [[A_SROA_0_0_INSERT_EXT:%.*]] = zext i32 [[A_COERCE_FCA_0_EXTRACT]] to i64
+; CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i32] [[A_COERCE]], 1
+; CHECK-NEXT: [[A_SROA_0_4_INSERT_EXT:%.*]] = zext i32 [[A_COERCE_FCA_1_EXTRACT]] to i64
+; CHECK-NEXT: [[A_SROA_0_4_INSERT_SHIFT:%.*]] = shl nuw i64 [[A_SROA_0_4_INSERT_EXT]], 32
+; CHECK-NEXT: [[A_SROA_0_4_INSERT_INSERT:%.*]] = or i64 [[A_SROA_0_4_INSERT_SHIFT]], [[A_SROA_0_0_INSERT_EXT]]
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A_SROA_0_4_INSERT_INSERT]] to double
+; CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[A_COERCE]], 2
+; CHECK-NEXT: [[A_SROA_3_8_INSERT_EXT:%.*]] = zext i32 [[A_COERCE_FCA_2_EXTRACT]] to i64
+; CHECK-NEXT: [[A_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[A_COERCE]], 3
+; CHECK-NEXT: [[A_SROA_3_12_INSERT_EXT:%.*]] = zext i32 [[A_COERCE_FCA_3_EXTRACT]] to i64
+; CHECK-NEXT: [[A_SROA_3_12_INSERT_SHIFT:%.*]] = shl nuw i64 [[A_SROA_3_12_INSERT_EXT]], 32
+; CHECK-NEXT: [[A_SROA_3_12_INSERT_INSERT:%.*]] = or i64 [[A_SROA_3_12_INSERT_SHIFT]], [[A_SROA_3_8_INSERT_EXT]]
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[A_SROA_3_12_INSERT_INSERT]] to double
+; CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x i32] [[B_COERCE:%.*]], 0
+; CHECK-NEXT: [[B_SROA_0_0_INSERT_EXT:%.*]] = zext i32 [[B_COERCE_FCA_0_EXTRACT]] to i64
+; CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x i32] [[B_COERCE]], 1
+; CHECK-NEXT: [[B_SROA_0_4_INSERT_EXT:%.*]] = zext i32 [[B_COERCE_FCA_1_EXTRACT]] to i64
+; CHECK-NEXT: 
[[B_SROA_0_4_INSERT_SHIFT:%.*]] = shl nuw i64 [[B_SROA_0_4_INSERT_EXT]], 32 +; CHECK-NEXT: [[B_SROA_0_4_INSERT_INSERT:%.*]] = or i64 [[B_SROA_0_4_INSERT_SHIFT]], [[B_SROA_0_0_INSERT_EXT]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[B_SROA_0_4_INSERT_INSERT]] to double +; CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x i32] [[B_COERCE]], 2 +; CHECK-NEXT: [[B_SROA_3_8_INSERT_EXT:%.*]] = zext i32 [[B_COERCE_FCA_2_EXTRACT]] to i64 +; CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x i32] [[B_COERCE]], 3 +; CHECK-NEXT: [[B_SROA_3_12_INSERT_EXT:%.*]] = zext i32 [[B_COERCE_FCA_3_EXTRACT]] to i64 +; CHECK-NEXT: [[B_SROA_3_12_INSERT_SHIFT:%.*]] = shl nuw i64 [[B_SROA_3_12_INSERT_EXT]], 32 +; CHECK-NEXT: [[B_SROA_3_12_INSERT_INSERT:%.*]] = or i64 [[B_SROA_3_12_INSERT_SHIFT]], [[B_SROA_3_8_INSERT_EXT]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64 [[B_SROA_3_12_INSERT_INSERT]] to double +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP0]], [[TMP2]] +; CHECK-NEXT: [[ADD3:%.*]] = fadd double [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[RE_I_I:%.*]] = getelementptr inbounds [[CLASS_COMPLEX:%.*]], %class.Complex* [[AGG_RESULT:%.*]], i32 0, i32 0 +; CHECK-NEXT: store double [[ADD]], double* [[RE_I_I]], align 4 +; CHECK-NEXT: [[IM_I_I:%.*]] = getelementptr inbounds [[CLASS_COMPLEX]], %class.Complex* [[AGG_RESULT]], i32 0, i32 1 +; CHECK-NEXT: store double [[ADD3]], double* [[IM_I_I]], align 4 +; CHECK-NEXT: ret void +; entry: %a.coerce.fca.0.extract = extractvalue [4 x i32] %a.coerce, 0 %a.sroa.0.0.insert.ext = zext i32 %a.coerce.fca.0.extract to i64 diff --git a/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll index d8b80f4..7038b0f 100644 --- a/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll +++ b/llvm/test/Transforms/SLPVectorizer/NVPTX/v2f16.ll @@ -1,20 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_70 | FileCheck %s ; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=nvptx64-nvidia-cuda -mcpu=sm_40 | FileCheck %s -check-prefix=NOVECTOR -; CHECK-LABEL: @fusion -; CHECK: load <2 x half>, <2 x half>* -; CHECK: fmul fast <2 x half> -; CHECK: fadd fast <2 x half> -; CHECK: store <2 x half> %4, <2 x half> - -; NOVECTOR-LABEL: @fusion -; NOVECTOR: load half -; NOVECTOR: fmul fast half -; NOVECTOR: fadd fast half -; NOVECTOR: fmul fast half -; NOVECTOR: fadd fast half -; NOVECTOR: store half define void @fusion(i8* noalias nocapture align 256 dereferenceable(19267584) %arg, i8* noalias nocapture readonly align 256 dereferenceable(19267584) %arg1, i32 %arg2, i32 %arg3) local_unnamed_addr #0 { +; CHECK-LABEL: @fusion( +; CHECK-NEXT: [[TMP:%.*]] = shl nuw nsw i32 [[ARG2:%.*]], 6 +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP]], [[ARG3:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 +; CHECK-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i8* [[ARG1:%.*]] to half* +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[ARG:%.*]] to half* +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast half* [[TMP11]] to <2 x half>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, <2 x half>* [[TMP1]], align 
8 +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x half> , [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x half> , [[TMP3]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast half* [[TMP16]] to <2 x half>* +; CHECK-NEXT: store <2 x half> [[TMP4]], <2 x half>* [[TMP5]], align 8 +; CHECK-NEXT: ret void +; +; NOVECTOR-LABEL: @fusion( +; NOVECTOR-NEXT: [[TMP:%.*]] = shl nuw nsw i32 [[ARG2:%.*]], 6 +; NOVECTOR-NEXT: [[TMP4:%.*]] = or i32 [[TMP]], [[ARG3:%.*]] +; NOVECTOR-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 2 +; NOVECTOR-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64 +; NOVECTOR-NEXT: [[TMP7:%.*]] = or i64 [[TMP6]], 1 +; NOVECTOR-NEXT: [[TMP10:%.*]] = bitcast i8* [[ARG1:%.*]] to half* +; NOVECTOR-NEXT: [[TMP11:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP6]] +; NOVECTOR-NEXT: [[TMP12:%.*]] = load half, half* [[TMP11]], align 8 +; NOVECTOR-NEXT: [[TMP13:%.*]] = fmul fast half [[TMP12]], 0xH5380 +; NOVECTOR-NEXT: [[TMP14:%.*]] = fadd fast half [[TMP13]], 0xH57F0 +; NOVECTOR-NEXT: [[TMP15:%.*]] = bitcast i8* [[ARG:%.*]] to half* +; NOVECTOR-NEXT: [[TMP16:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP6]] +; NOVECTOR-NEXT: store half [[TMP14]], half* [[TMP16]], align 8 +; NOVECTOR-NEXT: [[TMP17:%.*]] = getelementptr inbounds half, half* [[TMP10]], i64 [[TMP7]] +; NOVECTOR-NEXT: [[TMP18:%.*]] = load half, half* [[TMP17]], align 2 +; NOVECTOR-NEXT: [[TMP19:%.*]] = fmul fast half [[TMP18]], 0xH5380 +; NOVECTOR-NEXT: [[TMP20:%.*]] = fadd fast half [[TMP19]], 0xH57F0 +; NOVECTOR-NEXT: [[TMP21:%.*]] = getelementptr inbounds half, half* [[TMP15]], i64 [[TMP7]] +; NOVECTOR-NEXT: store half [[TMP20]], half* [[TMP21]], align 2 +; NOVECTOR-NEXT: ret void +; %tmp = shl nuw nsw i32 %arg2, 6 %tmp4 = or i32 %tmp, %arg3 %tmp5 = shl nuw nsw i32 %tmp4, 2 diff --git a/llvm/test/Transforms/SLPVectorizer/PowerPC/pr27897.ll b/llvm/test/Transforms/SLPVectorizer/PowerPC/pr27897.ll index dabb338..7f7df82 100644 --- a/llvm/test/Transforms/SLPVectorizer/PowerPC/pr27897.ll +++ b/llvm/test/Transforms/SLPVectorizer/PowerPC/pr27897.ll @@ -1,8 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -mtriple=powerpc64-linux-gnu -mcpu=pwr8 -mattr=+vsx -slp-vectorizer < %s | FileCheck %s %struct.A = type { i8*, i8* } define i64 @foo(%struct.A* nocapture readonly %this) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[END_I:%.*]] = getelementptr inbounds [[STRUCT_A:%.*]], %struct.A* [[THIS:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8** [[END_I]] to i64* +; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64* [[TMP0]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.A* [[THIS]] to i64* +; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP2]], align 8 +; CHECK-NEXT: [[SUB_PTR_SUB_I:%.*]] = sub i64 [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[SUB_PTR_SUB_I]], 9 +; CHECK-NEXT: br i1 [[CMP]], label [[RETURN:%.*]], label [[LOR_LHS_FALSE:%.*]] +; CHECK: lor.lhs.false: +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[TMP3]] to i8* +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[TMP1]] to i8* +; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i8* [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[DOT:%.*]] = select i1 [[CMP2]], i64 2, i64 -1 +; CHECK-NEXT: ret i64 [[DOT]] +; CHECK: return: +; CHECK-NEXT: ret i64 2 +; entry: %end.i = getelementptr inbounds %struct.A, %struct.A* %this, i64 0, i32 1 %0 = bitcast i8** %end.i to i64* @@ -24,6 +44,3 @@ return: ret i64 2 } -; CHECK: 
load i64
-; CHECK-NOT: load <2 x i64>
-; CHECK-NOT: extractelement
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/align.ll b/llvm/test/Transforms/SLPVectorizer/X86/align.ll
index b74b709..5c7c4ce 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/align.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/align.ll
@@ -1,16 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 ; Simple 3-pair chain with loads and stores
-; CHECK-LABEL: @test1
 define void @test1(double* %a, double* %b, double* %c) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AGG_TMP_I_I_SROA_0:%.*]] = alloca [3 x double], align 16
+; CHECK-NEXT: [[STORE1:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[AGG_TMP_I_I_SROA_0]], i64 0, i64 1
+; CHECK-NEXT: [[STORE2:%.*]] = getelementptr inbounds [3 x double], [3 x double]* [[AGG_TMP_I_I_SROA_0]], i64 0, i64 2
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[B]] to <2 x double>*
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[STORE1]] to <2 x double>*
+; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8
+; CHECK-NEXT: ret void
+;
 entry:
 %agg.tmp.i.i.sroa.0 = alloca [3 x double], align 16
-; CHECK: %[[V0:[0-9]+]] = load <2 x double>, <2 x double>* %[[V2:[0-9]+]], align 8
- %i0 = load double, double* %a
- %i1 = load double, double* %b
+ %i0 = load double, double* %a
+ %i1 = load double, double* %b
 %mul = fmul double %i0, %i1
 %store1 = getelementptr inbounds [3 x double], [3 x double]* %agg.tmp.i.i.sroa.0, i64 0, i64 1
 %store2 = getelementptr inbounds [3 x double], [3 x double]* %agg.tmp.i.i.sroa.0, i64 0, i64 2
@@ -19,23 +34,29 @@ entry:
 %arrayidx4 = getelementptr inbounds double, double* %b, i64 1
 %i4 = load double, double* %arrayidx4, align 8
 %mul5 = fmul double %i3, %i4
-; CHECK: store <2 x double> %[[V1:[0-9]+]], <2 x double>* %[[V2:[0-9]+]], align 8
 store double %mul, double* %store1
 store double %mul5, double* %store2, align 16
-; CHECK: ret
 ret void
 }
 ; Float has 4-byte ABI alignment on x86_64. We must use the alignment of the
 ; value being loaded/stored, not the alignment of the pointer type.
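; A minimal sketch of that rule (%v and %p are placeholder names): the
; widened access keeps the scalars' 4-byte ABI alignment rather than the
; 16-byte natural alignment of the vector type, matching the checks below:
;   %v = load <4 x float>, <4 x float>* %p, align 4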
-; CHECK-LABEL: @test2 -; CHECK-NOT: align 8 -; CHECK: load <4 x float>{{.*}}, align 4 -; CHECK: store <4 x float>{{.*}}, align 4 -; CHECK: ret - define void @test2(float * %a, float * %b) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 1 +; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 +; CHECK-NEXT: [[A3:%.*]] = getelementptr inbounds float, float* [[A]], i64 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[B1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 +; CHECK-NEXT: [[B2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 +; CHECK-NEXT: [[B3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[B]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP1]], <4 x float>* [[TMP2]], align 4 +; CHECK-NEXT: ret void +; entry: %l0 = load float, float* %a %a1 = getelementptr inbounds float, float* %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/atomics.ll b/llvm/test/Transforms/SLPVectorizer/X86/atomics.ll index a48b793..c7f0549 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/atomics.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/atomics.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basicaa -slp-vectorizer -S |FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" @@ -7,16 +8,19 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" ; The SLPVectorizer should not vectorize atomic stores and it should not ; schedule regular stores around atomic stores. 
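; For illustration, the transform being ruled out would merge the plain
; stores into something like (hypothetical output, never produced here):
;   store <4 x i32> zeroinitializer, <4 x i32>* bitcast ([4 x i32]* @a to <4 x i32>*), align 16
; which would move them across the interleaved release-ordered atomic stores
; and break the ordering those atomics establish.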
-; CHECK-LABEL: test -; CHECK: store i32 -; CHECK: store atomic i32 -; CHECK: store i32 -; CHECK: store atomic i32 -; CHECK: store i32 -; CHECK: store atomic i32 -; CHECK: store i32 -; CHECK: store atomic i32 define void @test() { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 0), align 16 +; CHECK-NEXT: store atomic i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @x, i64 0, i64 0) release, align 16 +; CHECK-NEXT: store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 1), align 4 +; CHECK-NEXT: store atomic i32 1, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @x, i64 0, i64 1) release, align 4 +; CHECK-NEXT: store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 2), align 8 +; CHECK-NEXT: store atomic i32 2, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @x, i64 0, i64 2) release, align 8 +; CHECK-NEXT: store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 3), align 4 +; CHECK-NEXT: store atomic i32 3, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @x, i64 0, i64 3) release, align 4 +; CHECK-NEXT: ret void +; entry: store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @a, i64 0, i64 0), align 16 store atomic i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @x, i64 0, i64 0) release, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll index 98c2906..d229acd 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basicaa -slp-vectorizer -S -mcpu=corei7-avx | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" @@ -7,10 +8,17 @@ define void @test1(x86_mmx %a, x86_mmx %b, i64* %ptr) { ; Ensure we can handle x86_mmx values which are primitive and can be bitcast ; with integer types but can't be put into a vector. ; -; CHECK-LABEL: @test1 -; CHECK: store i64 -; CHECK: store i64 -; CHECK: ret void +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64 +; CHECK-NEXT: [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64 +; CHECK-NEXT: [[A_AND:%.*]] = and i64 [[A_CAST]], 42 +; CHECK-NEXT: [[B_AND:%.*]] = and i64 [[B_CAST]], 42 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i64, i64* [[PTR:%.*]], i32 1 +; CHECK-NEXT: store i64 [[A_AND]], i64* [[PTR]] +; CHECK-NEXT: store i64 [[B_AND]], i64* [[GEP]] +; CHECK-NEXT: ret void +; entry: %a.cast = bitcast x86_mmx %a to i64 %b.cast = bitcast x86_mmx %b to i64 @@ -26,10 +34,21 @@ define void @test2(x86_mmx %a, x86_mmx %b) { ; Same as @test1 but using phi-input vectorization instead of store ; vectorization. 
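The property both functions rely on, in a minimal sketch (illustrative function name): x86_mmx is a primitive type with no vector form, so although it bitcasts to i64, parallel i64 operations rooted in it must stay scalar.

define i64 @mmx_stays_scalar(x86_mmx %a, x86_mmx %b) {
  %a.cast = bitcast x86_mmx %a to i64
  %b.cast = bitcast x86_mmx %b to i64
  ; Two isomorphic 'and i64' operations that must not be fused into a
  ; <2 x i64> 'and', because their inputs originate as x86_mmx values.
  %a.and = and i64 %a.cast, 42
  %b.and = and i64 %b.cast, 42
  %sum = add i64 %a.and, %b.and
  ret i64 %sum
}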
; -; CHECK-LABEL: @test2 -; CHECK: and i64 -; CHECK: and i64 -; CHECK: ret void +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[EXIT:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[A_CAST:%.*]] = bitcast x86_mmx [[A:%.*]] to i64 +; CHECK-NEXT: [[B_CAST:%.*]] = bitcast x86_mmx [[B:%.*]] to i64 +; CHECK-NEXT: [[A_AND:%.*]] = and i64 [[A_CAST]], 42 +; CHECK-NEXT: [[B_AND:%.*]] = and i64 [[B_CAST]], 42 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[A_PHI:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[A_AND]], [[IF_THEN]] ] +; CHECK-NEXT: [[B_PHI:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[B_AND]], [[IF_THEN]] ] +; CHECK-NEXT: tail call void @f(i64 [[A_PHI]], i64 [[B_PHI]]) +; CHECK-NEXT: ret void +; entry: br i1 undef, label %if.then, label %exit @@ -50,9 +69,26 @@ exit: define i8 @test3(i8 *%addr) { ; Check that we do not vectorize types that are padded to a bigger ones. ; -; CHECK-LABEL: @test3 -; CHECK-NOT: <4 x i2> -; CHECK: ret i8 +; CHECK-LABEL: @test3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = bitcast i8* [[ADDR:%.*]] to i2* +; CHECK-NEXT: [[A0:%.*]] = getelementptr inbounds i2, i2* [[A]], i64 0 +; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i2, i2* [[A]], i64 1 +; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds i2, i2* [[A]], i64 2 +; CHECK-NEXT: [[A3:%.*]] = getelementptr inbounds i2, i2* [[A]], i64 3 +; CHECK-NEXT: [[L0:%.*]] = load i2, i2* [[A0]], align 1 +; CHECK-NEXT: [[L1:%.*]] = load i2, i2* [[A1]], align 1 +; CHECK-NEXT: [[L2:%.*]] = load i2, i2* [[A2]], align 1 +; CHECK-NEXT: [[L3:%.*]] = load i2, i2* [[A3]], align 1 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[P0:%.*]] = phi i2 [ [[L0]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[P1:%.*]] = phi i2 [ [[L1]], [[ENTRY]] ] +; CHECK-NEXT: [[P2:%.*]] = phi i2 [ [[L2]], [[ENTRY]] ] +; CHECK-NEXT: [[P3:%.*]] = phi i2 [ [[L3]], [[ENTRY]] ] +; CHECK-NEXT: [[R:%.*]] = zext i2 [[P2]] to i8 +; CHECK-NEXT: ret i8 [[R]] +; entry: %a = bitcast i8* %addr to i2* %a0 = getelementptr inbounds i2, i2* %a, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll index 382a43f..7378b8b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll @@ -1,12 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" -;CHECK-LABEL: @foo( -;CHECK: store <4 x i32> -;CHECK: ret define i32 @foo(i32* nocapture %A, i32 %n) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) 
@bar() +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[N:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[N]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[N]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[N]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> , [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4 +; CHECK-NEXT: ret i32 undef +; entry: %call = tail call i32 (...) @bar() #2 %mul = mul nsw i32 %n, 5 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/call.ll b/llvm/test/Transforms/SLPVectorizer/X86/call.ll index 8397d34..c93397c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/call.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/call.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basicaa -slp-vectorizer -slp-threshold=-999 -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -13,10 +14,10 @@ declare i64 @round(i64) define void @sin_libm(double* %a, double* %b) { ; CHECK-LABEL: @sin_libm( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>* +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: ret void ; @@ -33,10 +34,10 @@ define void @sin_libm(double* %a, double* %b) { define void @cos_libm(double* %a, double* %b) { ; CHECK-LABEL: @cos_libm( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.cos.v2f64(<2 x double> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>* +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: ret void ; @@ -53,10 +54,10 @@ define void @cos_libm(double* %a, double* %b) { define void @pow_libm(double* %a, double* %b) { ; CHECK-LABEL: @pow_libm( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.pow.v2f64(<2 x double> [[TMP2]], <2 x double> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>* +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: ret void ; @@ -73,10 +74,10 @@ 
define void @pow_libm(double* %a, double* %b) { define void @exp_libm(double* %a, double* %b) { ; CHECK-LABEL: @exp_libm( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.exp2.v2f64(<2 x double> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>* +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: ret void ; @@ -91,15 +92,15 @@ define void @exp_libm(double* %a, double* %b) { ret void } -; No fast-math-flags are required to convert sqrt library calls to an intrinsic. +; No fast-math-flags are required to convert sqrt library calls to an intrinsic. ; We just need to know that errno is not set (readnone). define void @sqrt_libm_no_errno(double* %a, double* %b) { ; CHECK-LABEL: @sqrt_libm_no_errno( -; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* %a to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* %b to <2 x double>* +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 ; CHECK-NEXT: ret void ; @@ -116,18 +117,18 @@ define void @sqrt_libm_no_errno(double* %a, double* %b) { ; The sqrt intrinsic does not set errno, but a non-constant sqrt call might, so this can't vectorize. 
; The nnan on the call does not matter because there's no guarantee in the C standard that a negative -; input would result in a nan output ("On a domain error, the function returns an +; input would result in a nan output ("On a domain error, the function returns an ; implementation-defined value.") define void @sqrt_libm_errno(double* %a, double* %b) { ; CHECK-LABEL: @sqrt_libm_errno( -; CHECK-NEXT: [[A0:%.*]] = load double, double* %a, align 8 -; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* %a, i64 1 +; CHECK-NEXT: [[A0:%.*]] = load double, double* [[A:%.*]], align 8 +; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds double, double* [[A]], i64 1 ; CHECK-NEXT: [[A1:%.*]] = load double, double* [[IDX1]], align 8 ; CHECK-NEXT: [[SQRT1:%.*]] = tail call nnan double @sqrt(double [[A0]]) #2 ; CHECK-NEXT: [[SQRT2:%.*]] = tail call nnan double @sqrt(double [[A1]]) #2 -; CHECK-NEXT: store double [[SQRT1]], double* %b, align 8 -; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, double* %b, i64 1 +; CHECK-NEXT: store double [[SQRT1]], double* [[B:%.*]], align 8 +; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds double, double* [[B]], i64 1 ; CHECK-NEXT: store double [[SQRT2]], double* [[IDX2]], align 8 ; CHECK-NEXT: ret void ; @@ -145,13 +146,13 @@ define void @sqrt_libm_errno(double* %a, double* %b) { ; Negative test case define void @round_custom(i64* %a, i64* %b) { ; CHECK-LABEL: @round_custom( -; CHECK-NEXT: [[A0:%.*]] = load i64, i64* %a, align 8 -; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i64, i64* %a, i64 1 +; CHECK-NEXT: [[A0:%.*]] = load i64, i64* [[A:%.*]], align 8 +; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 1 ; CHECK-NEXT: [[A1:%.*]] = load i64, i64* [[IDX1]], align 8 ; CHECK-NEXT: [[ROUND1:%.*]] = tail call i64 @round(i64 [[A0]]) #3 ; CHECK-NEXT: [[ROUND2:%.*]] = tail call i64 @round(i64 [[A1]]) #3 -; CHECK-NEXT: store i64 [[ROUND1]], i64* %b, align 8 -; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i64, i64* %b, i64 1 +; CHECK-NEXT: store i64 [[ROUND1]], i64* [[B:%.*]], align 8 +; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 1 ; CHECK-NEXT: store i64 [[ROUND2]], i64* [[IDX2]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll index 2798ccb..9af59ef 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -slp-vectorizer < %s -S | FileCheck %s ; Verify that the SLP vectorizer is able to figure out that commutativity @@ -16,9 +17,31 @@ target triple = "x86_64-apple-macosx10.11.0" ; Check that we correctly detect a splat/broadcast by leveraging the ; commutativity property of `xor`. 
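A reduced sketch of the idea (illustrative two-lane form; the test uses sixteen lanes so the cost model clearly favors vectorizing): xor commutes, so lanes written as `xor %c, %a` and `xor %b, %c` both carry %c as one operand, and after normalizing operand order the vectorizer can recognize %c as a splat/broadcast.

define void @xor_splat(i8 %a, i8 %b, i8 %c, i8* %out) {
  %x0 = xor i8 %c, %a                 ; %c on the left here...
  %x1 = xor i8 %b, %c                 ; ...and on the right here.
  store i8 %x0, i8* %out, align 1
  %out1 = getelementptr inbounds i8, i8* %out, i64 1
  store i8 %x1, i8* %out1, align 1
  ret void
}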
-; CHECK-LABEL: @splat -; CHECK: store <16 x i8> define void @splat(i8 %a, i8 %b, i8 %c) { +; CHECK-LABEL: @splat( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i8> undef, i8 [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i8> [[TMP1]], i8 [[C]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> [[TMP2]], i8 [[C]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[C]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[C]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[C]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[C]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[C]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[C]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[C]], i32 9 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[C]], i32 10 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 [[C]], i32 11 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[C]], i32 12 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[C]], i32 13 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[C]], i32 14 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[C]], i32 15 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <2 x i8> undef, i8 [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x i8> [[TMP17]], i8 [[B:%.*]], i32 1 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i8> [[TMP18]], <2 x i8> undef, <16 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = xor <16 x i8> [[TMP16]], [[SHUFFLE]] +; CHECK-NEXT: store <16 x i8> [[TMP19]], <16 x i8>* bitcast ([32 x i8]* @cle to <16 x i8>*), align 16 +; CHECK-NEXT: ret void +; %1 = xor i8 %c, %a store i8 %1, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @cle, i64 0, i64 0), align 16 %2 = xor i8 %a, %c @@ -59,9 +82,24 @@ define void @splat(i8 %a, i8 %b, i8 %c) { ; Check that we correctly detect that we can have the same opcode on one side by ; leveraging the commutativity property of `xor`. 
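Likewise, a reduced sketch for this case (illustrative two-lane form, not from the test): the `add` feeding each lane is the same computation once its operands are commuted, so one side of the vector `xor` can be built from a single vector `add`.

define void @commuted_adds(i32 %a, i32 %c, i32* %out) {
  %add1 = add i32 %c, %a
  %add2 = add i32 %a, %c              ; same add, operands commuted
  %xor1 = xor i32 %add1, %a
  %xor2 = xor i32 %a, %add2           ; the xor is commuted as well
  store i32 %xor1, i32* %out, align 4
  %out1 = getelementptr inbounds i32, i32* %out, i64 1
  store i32 %xor2, i32* %out1, align 4
  ret void
}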
-; CHECK-LABEL: @same_opcode_on_one_side -; CHECK: store <4 x i32> define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) { +; CHECK-LABEL: @same_opcode_on_one_side( +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> undef, i32 [[C:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[C]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[C]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> undef, i32 [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[A]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[A]], i32 2 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[A]], i32 3 +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[B:%.*]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[C]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[A]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = xor <4 x i32> [[TMP12]], [[TMP9]] +; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* bitcast ([32 x i32]* @cle32 to <4 x i32>*), align 16 +; CHECK-NEXT: ret void +; %add1 = add i32 %c, %a %add2 = add i32 %c, %a %add3 = add i32 %a, %c diff --git a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll index aa682fc..f394dc7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" @@ -11,10 +12,38 @@ target triple = "x86_64-apple-macosx10.9.0" ; A[3*i], A[3*i+1] and A[3*i+2] are consecutive, but in future ; that would hopefully be fixed. For now, check that this isn't ; vectorized. 
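The reasoning step the analysis is missing, in a minimal sketch (illustrative function): adjacency of &A[3*u] and &A[3*u+1] requires proving sext(3*u + 1) == sext(3*u) + 1, which the nsw flags would justify but which is not exploited here yet.

define void @stride3(double* %A, i32 %u) {
  %mul = mul nsw i32 %u, 3
  %idx0 = sext i32 %mul to i64
  %p0 = getelementptr inbounds double, double* %A, i64 %idx0   ; &A[3*u]
  %add = add nsw i32 %mul, 1
  %idx1 = sext i32 %add to i64
  %p1 = getelementptr inbounds double, double* %A, i64 %idx1   ; &A[3*u+1]
  ; The two loads are adjacent in memory, but only if the sext is
  ; proven to commute with the +1, which nsw guarantees.
  %v0 = load double, double* %p0, align 8
  %v1 = load double, double* %p1, align 8
  %s = fadd double %v0, %v1
  store double %s, double* %p0, align 8
  ret void
}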
-; CHECK-LABEL: foo_3double -; CHECK-NOT: x double> ; Function Attrs: nounwind ssp uwtable define void @foo_3double(i32 %u) #0 { +; CHECK-LABEL: @foo_3double( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[U:%.*]], i32* [[U_ADDR]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[U]], 3 +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[ADD5:%.*]] = fadd double [[TMP0]], [[TMP1]] +; CHECK-NEXT: store double [[ADD5]], double* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[MUL]], 1 +; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX13]], align 8 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[ARRAYIDX17]], align 8 +; CHECK-NEXT: [[ADD18:%.*]] = fadd double [[TMP2]], [[TMP3]] +; CHECK-NEXT: store double [[ADD18]], double* [[ARRAYIDX13]], align 8 +; CHECK-NEXT: [[ADD24:%.*]] = add nsw i32 [[MUL]], 2 +; CHECK-NEXT: [[IDXPROM25:%.*]] = sext i32 [[ADD24]] to i64 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM25]] +; CHECK-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX26]], align 8 +; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM25]] +; CHECK-NEXT: [[TMP5:%.*]] = load double, double* [[ARRAYIDX30]], align 8 +; CHECK-NEXT: [[ADD31:%.*]] = fadd double [[TMP4]], [[TMP5]] +; CHECK-NEXT: store double [[ADD31]], double* [[ARRAYIDX26]], align 8 +; CHECK-NEXT: ret void +; entry: %u.addr = alloca i32, align 4 store i32 %u, i32* %u.addr, align 4 @@ -48,10 +77,29 @@ entry: ; SCEV should be able to tell that accesses A[C1 + C2*i], A[C1 + C2*i], ... ; A[C1 + C2*i] are consecutive, if C2 is a power of 2, and C2 > C1 > 0. ; Thus, the following code should be vectorized. 
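Before the updated checks that follow, note the output idiom they pin down: the pair of adjacent scalar loads collapses into a bitcast of the first address plus a single <2 x double> load. A minimal sketch of that shape (illustrative function, not from the patch):

define <2 x double> @vectorized_shape(double* %arrayidx) {
  ; What used to be separate loads of A[i] and A[i+1] becomes one wide load.
  %vp = bitcast double* %arrayidx to <2 x double>*
  %v = load <2 x double>, <2 x double>* %vp, align 8
  ret <2 x double> %v
}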
-; CHECK-LABEL: foo_2double -; CHECK: x double> ; Function Attrs: nounwind ssp uwtable define void @foo_2double(i32 %u) #0 { +; CHECK-LABEL: @foo_2double( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[U:%.*]], i32* [[U_ADDR]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[U]], 2 +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[MUL]], 1 +; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: ret void +; entry: %u.addr = alloca i32, align 4 store i32 %u, i32* %u.addr, align 4 @@ -75,10 +123,37 @@ entry: } ; Similar to the previous test, but with different datatype. -; CHECK-LABEL: foo_4float -; CHECK: x float> ; Function Attrs: nounwind ssp uwtable define void @foo_4float(i32 %u) #0 { +; CHECK-LABEL: @foo_4float( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[U:%.*]], i32* [[U_ADDR]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[U]], 4 +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[MUL]], 1 +; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD11]] to i64 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[ADD24:%.*]] = add nsw i32 [[MUL]], 2 +; CHECK-NEXT: [[IDXPROM25:%.*]] = sext i32 [[ADD24]] to i64 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 [[IDXPROM25]] +; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 [[IDXPROM25]] +; CHECK-NEXT: [[ADD37:%.*]] = add nsw i32 [[MUL]], 3 +; CHECK-NEXT: [[IDXPROM38:%.*]] = sext i32 [[ADD37]] to i64 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @C, i32 0, i64 [[IDXPROM38]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[ARRAYIDX43:%.*]] = getelementptr inbounds [2000 x float], [2000 x float]* @D, i32 0, i64 
[[IDXPROM38]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast float* [[ARRAYIDX4]] to <4 x float>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 +; CHECK-NEXT: ret void +; entry: %u.addr = alloca i32, align 4 store i32 %u, i32* %u.addr, align 4 @@ -118,10 +193,51 @@ entry: } ; Similar to the previous tests, but now we are dealing with AddRec SCEV. -; CHECK-LABEL: foo_loop -; CHECK: x double> ; Function Attrs: nounwind ssp uwtable define i32 @foo_loop(double* %A, i32 %n) #0 { +; CHECK-LABEL: @foo_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8 +; CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[SUM:%.*]] = alloca double, align 8 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store double* [[A:%.*]], double** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 [[N:%.*]], i32* [[N_ADDR]], align 4 +; CHECK-NEXT: store double 0.000000e+00, double* [[SUM]], align 8 +; CHECK-NEXT: store i32 0, i32* [[I]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[N]] +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 2 +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[MUL]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[MUL]], 1 +; CHECK-NEXT: [[IDXPROM3:%.*]] = sext i32 [[ADD]] to i64 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM3]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> , [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; CHECK-NEXT: [[ADD6:%.*]] = fadd double [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[ADD7]] = fadd double [[TMP1]], [[ADD6]] +; CHECK-NEXT: store double [[ADD7]], double* [[SUM]], align 8 +; CHECK-NEXT: [[INC]] = add nsw i32 [[TMP0]], 1 +; CHECK-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]] +; CHECK: for.cond.for.end_crit_edge: +; CHECK-NEXT: [[SPLIT:%.*]] = phi double [ [[ADD7]], [[FOR_BODY]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi double [ [[SPLIT]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CONV:%.*]] = fptosi double [[DOTLCSSA]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; entry: %A.addr = alloca double*, align 8 %n.addr = alloca i32, align 4 @@ -170,11 +286,30 @@ for.end: ; preds = %for.cond.for.end_cr ; Similar to foo_2double but with a non-power-of-2 factor and potential ; wrapping (both indices wrap or both don't in the same time) -; CHECK-LABEL: foo_2double_non_power_of_2 -; 
CHECK: load <2 x double> -; CHECK: load <2 x double> ; Function Attrs: nounwind ssp uwtable define void @foo_2double_non_power_of_2(i32 %u) #0 { +; CHECK-LABEL: @foo_2double_non_power_of_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[U:%.*]], i32* [[U_ADDR]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[U]], 6 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[MUL]], 6 +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[ADD6]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[MUL]], 7 +; CHECK-NEXT: [[IDXPROM12:%.*]] = sext i32 [[ADD7]] to i64 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: ret void +; entry: %u.addr = alloca i32, align 4 store i32 %u, i32* %u.addr, align 4 @@ -199,11 +334,30 @@ entry: } ; Similar to foo_2double_non_power_of_2 but with zext's instead of sext's -; CHECK-LABEL: foo_2double_non_power_of_2_zext -; CHECK: load <2 x double> -; CHECK: load <2 x double> ; Function Attrs: nounwind ssp uwtable define void @foo_2double_non_power_of_2_zext(i32 %u) #0 { +; CHECK-LABEL: @foo_2double_non_power_of_2_zext( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[U_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store i32 [[U:%.*]], i32* [[U_ADDR]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[U]], 6 +; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[MUL]], 6 +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[ADD6]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM]] +; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[MUL]], 7 +; CHECK-NEXT: [[IDXPROM12:%.*]] = zext i32 [[ADD7]] to i64 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @A, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [2000 x double], [2000 x double]* @B, i32 0, i64 [[IDXPROM12]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX4]] to <2 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: ret void +; entry: %u.addr = alloca i32, align 4 store i32 %u, i32* 
%u.addr, align 4 @@ -230,10 +384,52 @@ entry: ; Similar to foo_2double_non_power_of_2, but now we are dealing with AddRec SCEV. ; Alternatively, this is like foo_loop, but with a non-power-of-2 factor and ; potential wrapping (both indices wrap or both don't in the same time) -; CHECK-LABEL: foo_loop_non_power_of_2 -; CHECK: <2 x double> ; Function Attrs: nounwind ssp uwtable define i32 @foo_loop_non_power_of_2(double* %A, i32 %n) #0 { +; CHECK-LABEL: @foo_loop_non_power_of_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8 +; CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[SUM:%.*]] = alloca double, align 8 +; CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +; CHECK-NEXT: store double* [[A:%.*]], double** [[A_ADDR]], align 8 +; CHECK-NEXT: store i32 [[N:%.*]], i32* [[N_ADDR]], align 4 +; CHECK-NEXT: store double 0.000000e+00, double* [[SUM]], align 8 +; CHECK-NEXT: store i32 0, i32* [[I]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 0, [[N]] +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi double [ 0.000000e+00, [[FOR_BODY_LR_PH]] ], [ [[ADD7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[TMP0]], 12 +; CHECK-NEXT: [[ADD_5:%.*]] = add i32 [[MUL]], 5 +; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[ADD_5]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ADD_6:%.*]] = add i32 [[MUL]], 6 +; CHECK-NEXT: [[IDXPROM3:%.*]] = sext i32 [[ADD_6]] to i64 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM3]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> , [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1 +; CHECK-NEXT: [[ADD6:%.*]] = fadd double [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[ADD7]] = fadd double [[TMP1]], [[ADD6]] +; CHECK-NEXT: store double [[ADD7]], double* [[SUM]], align 8 +; CHECK-NEXT: [[INC]] = add i32 [[TMP0]], 1 +; CHECK-NEXT: store i32 [[INC]], i32* [[I]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_END_CRIT_EDGE:%.*]] +; CHECK: for.cond.for.end_crit_edge: +; CHECK-NEXT: [[SPLIT:%.*]] = phi double [ [[ADD7]], [[FOR_BODY]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi double [ [[SPLIT]], [[FOR_COND_FOR_END_CRIT_EDGE]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CONV:%.*]] = fptosi double [[DOTLCSSA]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; entry: %A.addr = alloca double*, align 8 %n.addr = alloca i32, align 4 @@ -299,9 +495,32 @@ for.end: ; preds = %for.cond.for.end_cr ; ; Make sure we are able to vectorize this from now on: ; -; CHECK-LABEL: @bar -; CHECK: load <2 x double> define double @bar(double* nocapture readonly %a, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @bar( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP15:%.*]] = icmp eq i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]] +; CHECK: 
for.cond.cleanup: +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP6:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 +; CHECK-NEXT: [[MUL:%.*]] = fmul double [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret double [[MUL]] +; CHECK: for.body: +; CHECK-NEXT: [[I_018:%.*]] = phi i32 [ [[ADD5:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x double> [ [[TMP6]], [[FOR_BODY]] ], [ zeroinitializer, [[ENTRY]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_018]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ADD1:%.*]] = or i32 [[I_018]], 1 +; CHECK-NEXT: [[IDXPROM2:%.*]] = zext i32 [[ADD1]] to i64 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM2]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[ADD5]] = add i32 [[I_018]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD5]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] +; entry: %cmp15 = icmp eq i32 %n, 0 br i1 %cmp15, label %for.cond.cleanup, label %for.body diff --git a/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll b/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll index ecae70e..060cb05 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/continue_vectorizing.ll @@ -1,13 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" ; We will keep trying to vectorize the basic block even we already find vectorized store. 
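In sketch form (illustrative function; whether the second chain is profitable is up to the cost model), the situation is one basic block holding two independent store chains, and the point is that the block is revisited after the first chain has been turned into a vector store:

define void @two_chains(double* %a, double* %c, i32* %p, i32* %q) {
  ; First chain: two fmuls feeding adjacent double stores.
  %a1 = getelementptr inbounds double, double* %a, i64 1
  %i0 = load double, double* %a, align 8
  %i1 = load double, double* %a1, align 8
  %m0 = fmul double %i0, %i0
  %m1 = fmul double %i1, %i1
  %c1 = getelementptr inbounds double, double* %c, i64 1
  store double %m0, double* %c, align 8
  store double %m1, double* %c1, align 8
  ; Second, independent chain in the same block; SLP keeps scanning and
  ; can vectorize this one after the first has been rewritten.
  %p1 = getelementptr inbounds i32, i32* %p, i64 1
  %x0 = load i32, i32* %p, align 4
  %x1 = load i32, i32* %p1, align 4
  %y0 = mul i32 %x0, %x0
  %y1 = mul i32 %x1, %x1
  %q1 = getelementptr inbounds i32, i32* %q, i64 1
  store i32 %y0, i32* %q, align 4
  store i32 %y1, i32* %q1, align 4
  ret void
}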
-; CHECK: test1 -; CHECK: store <2 x double> -; CHECK: ret define void @test1(double* %a, double* %b, double* %c, double* %d) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[A]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, double* [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[B]] to <2 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, <2 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[C:%.*]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[C]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[A]] to <4 x i32>* +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[B]] to <4 x i32>* +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[TMP8]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast double* [[D:%.*]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP11]], align 8 +; CHECK-NEXT: ret void +; entry: %i0 = load double, double* %a, align 8 %i1 = load double, double* %b, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll index 54e5d5a..e7bff49 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -7,6 +8,32 @@ target triple = "x86_64-apple-macosx10.8.0" %struct._CLzmaProps.0.27.54.81.102.123.144.165.180.195.228.258.333 = type { i32, i32, i32, i32 } define fastcc void @LzmaDec_DecodeReal2(%struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p) { +; CHECK-LABEL: @LzmaDec_DecodeReal2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[RANGE20_I:%.*]] = getelementptr inbounds [[STRUCT_CLZMADEC_1_28_55_82_103_124_145_166_181_196_229_259_334:%.*]], %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* [[P:%.*]], i64 0, i32 4 +; CHECK-NEXT: [[CODE21_I:%.*]] = getelementptr inbounds [[STRUCT_CLZMADEC_1_28_55_82_103_124_145_166_181_196_229_259_334]], %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* [[P]], i64 0, i32 5 +; CHECK-NEXT: br label [[DO_BODY66_I:%.*]] +; CHECK: do.body66.i: +; CHECK-NEXT: [[RANGE_2_I:%.*]] = phi i32 [ [[RANGE_4_I:%.*]], [[DO_COND_I:%.*]] ], [ undef, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CODE_2_I:%.*]] = phi i32 [ [[CODE_4_I:%.*]], [[DO_COND_I]] ], [ undef, [[ENTRY]] ] +; CHECK-NEXT: [[DOTRANGE_2_I:%.*]] = select i1 undef, i32 undef, i32 [[RANGE_2_I]] +; CHECK-NEXT: [[DOTCODE_2_I:%.*]] = select i1 undef, i32 undef, i32 [[CODE_2_I]] +; 
CHECK-NEXT: br i1 undef, label [[DO_COND_I]], label [[IF_ELSE_I:%.*]] +; CHECK: if.else.i: +; CHECK-NEXT: [[SUB91_I:%.*]] = sub i32 [[DOTRANGE_2_I]], undef +; CHECK-NEXT: [[SUB92_I:%.*]] = sub i32 [[DOTCODE_2_I]], undef +; CHECK-NEXT: br label [[DO_COND_I]] +; CHECK: do.cond.i: +; CHECK-NEXT: [[RANGE_4_I]] = phi i32 [ [[SUB91_I]], [[IF_ELSE_I]] ], [ undef, [[DO_BODY66_I]] ] +; CHECK-NEXT: [[CODE_4_I]] = phi i32 [ [[SUB92_I]], [[IF_ELSE_I]] ], [ [[DOTCODE_2_I]], [[DO_BODY66_I]] ] +; CHECK-NEXT: br i1 undef, label [[DO_BODY66_I]], label [[DO_END1006_I:%.*]] +; CHECK: do.end1006.i: +; CHECK-NEXT: [[DOTRANGE_4_I:%.*]] = select i1 undef, i32 undef, i32 [[RANGE_4_I]] +; CHECK-NEXT: [[DOTCODE_4_I:%.*]] = select i1 undef, i32 undef, i32 [[CODE_4_I]] +; CHECK-NEXT: store i32 [[DOTRANGE_4_I]], i32* [[RANGE20_I]], align 4 +; CHECK-NEXT: store i32 [[DOTCODE_4_I]], i32* [[CODE21_I]], align 4 +; CHECK-NEXT: ret void +; entry: %range20.i = getelementptr inbounds %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334, %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p, i64 0, i32 4 %code21.i = getelementptr inbounds %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334, %struct.CLzmaDec.1.28.55.82.103.124.145.166.181.196.229.259.334* %p, i64 0, i32 5 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll index 9046c35..eec5373 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_binaryop.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-darwin13.3.0" @@ -6,6 +7,28 @@ target triple = "x86_64-apple-darwin13.3.0" @a = common global double 0.000000e+00, align 8 define i32 @fn1() { +; CHECK-LABEL: @fn1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INIT:%.*]] = load double, double* @a, align 8 +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[PHI:%.*]] = phi double [ [[ADD2:%.*]], [[LOOP]] ], [ [[INIT]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[POSTADD1_PHI:%.*]] = phi double [ [[POSTADD1:%.*]], [[LOOP]] ], [ [[INIT]], [[ENTRY]] ] +; CHECK-NEXT: [[POSTADD2_PHI:%.*]] = phi double [ [[POSTADD2:%.*]], [[LOOP]] ], [ [[INIT]], [[ENTRY]] ] +; CHECK-NEXT: [[ADD1:%.*]] = fadd double [[POSTADD1_PHI]], undef +; CHECK-NEXT: [[ADD2]] = fadd double [[POSTADD2_PHI]], [[PHI]] +; CHECK-NEXT: [[MUL2:%.*]] = fmul double [[ADD2]], 0.000000e+00 +; CHECK-NEXT: [[BINARYOP_B:%.*]] = fadd double [[POSTADD1_PHI]], [[MUL2]] +; CHECK-NEXT: [[MUL1:%.*]] = fmul double [[ADD1]], 0.000000e+00 +; CHECK-NEXT: [[TMP:%.*]] = fadd double [[POSTADD2_PHI]], 0.000000e+00 +; CHECK-NEXT: [[BINARY_V:%.*]] = fadd double [[MUL1]], [[BINARYOP_B]] +; CHECK-NEXT: [[POSTADD1]] = fadd double [[BINARY_V]], 0.000000e+00 +; CHECK-NEXT: [[POSTADD2]] = fadd double [[TMP]], 1.000000e+00 +; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une double [[POSTADD1]], 0.000000e+00 +; CHECK-NEXT: br i1 [[TOBOOL]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret i32 1 +; entry: %init = load double, double* @a, align 8 br label %loop diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll 
index 1bad671..a1a3f50 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -6,6 +7,32 @@ target triple = "x86_64-apple-macosx10.8.0" %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960" = type { i32, i32 } define void @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btConstraintInfo1E(%"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* nocapture %info) { +; CHECK-LABEL: @_ZN23btGeneric6DofConstraint8getInfo1EPN17btTypedConstraint17btConstraintInfo1E( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 undef, label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: ret void +; CHECK: if.else: +; CHECK-NEXT: [[M_NUMCONSTRAINTROWS4:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[NUB5:%.*]] = getelementptr inbounds %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960", %"struct.btTypedConstraint::btConstraintInfo1.17.157.357.417.477.960"* [[INFO]], i64 0, i32 1 +; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE_I_1:%.*]], label [[IF_THEN7_1:%.*]] +; CHECK: land.lhs.true.i.1: +; CHECK-NEXT: br i1 undef, label [[FOR_INC_1:%.*]], label [[IF_THEN7_1]] +; CHECK: if.then7.1: +; CHECK-NEXT: [[INC_1:%.*]] = add nsw i32 0, 1 +; CHECK-NEXT: store i32 [[INC_1]], i32* [[M_NUMCONSTRAINTROWS4]], align 4 +; CHECK-NEXT: [[DEC_1:%.*]] = add nsw i32 6, -1 +; CHECK-NEXT: store i32 [[DEC_1]], i32* [[NUB5]], align 4 +; CHECK-NEXT: br label [[FOR_INC_1]] +; CHECK: for.inc.1: +; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[DEC_1]], [[IF_THEN7_1]] ], [ 6, [[LAND_LHS_TRUE_I_1]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ [[INC_1]], [[IF_THEN7_1]] ], [ 0, [[LAND_LHS_TRUE_I_1]] ] +; CHECK-NEXT: [[INC_2:%.*]] = add nsw i32 [[TMP1]], 1 +; CHECK-NEXT: store i32 [[INC_2]], i32* [[M_NUMCONSTRAINTROWS4]], align 4 +; CHECK-NEXT: [[DEC_2:%.*]] = add nsw i32 [[TMP0]], -1 +; CHECK-NEXT: store i32 [[DEC_2]], i32* [[NUB5]], align 4 +; CHECK-NEXT: unreachable +; entry: br i1 undef, label %if.else, label %if.then @@ -42,6 +69,30 @@ for.inc.1: ; preds = %if.then7.1, %land.l %class.btVector4.7.32.67.92.117.142.177.187.262.282.331 = type { %class.btVector3.5.30.65.90.115.140.175.185.260.280.330 } define void @_ZN30GIM_TRIANGLE_CALCULATION_CACHE18triangle_collisionERK9btVector3S2_S2_fS2_S2_S2_fR25GIM_TRIANGLE_CONTACT_DATA(%class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this) { +; CHECK-LABEL: @_ZN30GIM_TRIANGLE_CALCULATION_CACHE18triangle_collisionERK9btVector3S2_S2_fS2_S2_S2_fR25GIM_TRIANGLE_CONTACT_DATA( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [[CLASS_GIM_TRIANGLE_CALCULATION_CACHE_9_34_69_94_119_144_179_189_264_284_332:%.*]], %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* [[THIS:%.*]], i64 0, i32 2, i64 0, i32 0, i64 
1 +; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds [[CLASS_GIM_TRIANGLE_CALCULATION_CACHE_9_34_69_94_119_144_179_189_264_284_332]], %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* [[THIS]], i64 0, i32 2, i64 0, i32 0, i64 2 +; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[ARRAYIDX36]], align 4 +; CHECK-NEXT: [[ADD587:%.*]] = fadd float undef, undef +; CHECK-NEXT: [[SUB600:%.*]] = fsub float [[ADD587]], undef +; CHECK-NEXT: store float [[SUB600]], float* undef, align 4 +; CHECK-NEXT: [[SUB613:%.*]] = fsub float [[ADD587]], [[SUB600]] +; CHECK-NEXT: store float [[SUB613]], float* [[ARRAYIDX26]], align 4 +; CHECK-NEXT: [[ADD626:%.*]] = fadd float [[TMP0]], undef +; CHECK-NEXT: [[SUB639:%.*]] = fsub float [[ADD626]], undef +; CHECK-NEXT: [[SUB652:%.*]] = fsub float [[ADD626]], [[SUB639]] +; CHECK-NEXT: store float [[SUB652]], float* [[ARRAYIDX36]], align 4 +; CHECK-NEXT: br i1 undef, label [[IF_ELSE1609:%.*]], label [[IF_THEN1595:%.*]] +; CHECK: if.then1595: +; CHECK-NEXT: br i1 undef, label [[RETURN:%.*]], label [[FOR_BODY_LR_PH_I_I1702:%.*]] +; CHECK: for.body.lr.ph.i.i1702: +; CHECK-NEXT: unreachable +; CHECK: if.else1609: +; CHECK-NEXT: unreachable +; CHECK: return: +; CHECK-NEXT: ret void +; entry: %arrayidx26 = getelementptr inbounds %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332, %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this, i64 0, i32 2, i64 0, i32 0, i64 1 %arrayidx36 = getelementptr inbounds %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332, %class.GIM_TRIANGLE_CALCULATION_CACHE.9.34.69.94.119.144.179.189.264.284.332* %this, i64 0, i32 2, i64 0, i32 0, i64 2 @@ -71,6 +122,40 @@ return: ; preds = %if.then1595 } define void @_Z8dBoxBox2RK9btVector3PKfS1_S1_S3_S1_RS_PfPiiP12dContactGeomiRN36btDiscreteCollisionDetectorInterface6ResultE() { +; CHECK-LABEL: @_Z8dBoxBox2RK9btVector3PKfS1_S1_S3_S1_RS_PfPiiP12dContactGeomiRN36btDiscreteCollisionDetectorInterface6ResultE( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 undef, label [[RETURN:%.*]], label [[IF_END:%.*]] +; CHECK: if.end: +; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END111:%.*]] +; CHECK: if.end111: +; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END136:%.*]] +; CHECK: if.end136: +; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END162:%.*]] +; CHECK: if.end162: +; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END189:%.*]] +; CHECK: if.end189: +; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END216:%.*]] +; CHECK: if.end216: +; CHECK-NEXT: br i1 undef, label [[IF_THEN218:%.*]], label [[IF_END225:%.*]] +; CHECK: if.then218: +; CHECK-NEXT: br label [[IF_END225]] +; CHECK: if.end225: +; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END248:%.*]] +; CHECK: if.end248: +; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END304:%.*]] +; CHECK: if.end304: +; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END361:%.*]] +; CHECK: if.end361: +; CHECK-NEXT: br i1 undef, label [[IF_THEN370:%.*]], label [[IF_END395:%.*]] +; CHECK: if.then370: +; CHECK-NEXT: br i1 undef, label [[IF_THEN374:%.*]], label [[IF_END395]] +; CHECK: if.then374: +; CHECK-NEXT: br label [[IF_END395]] +; CHECK: if.end395: +; CHECK-NEXT: unreachable +; CHECK: return: +; CHECK-NEXT: ret void +; entry: %add8.i2343 = fadd float undef, undef %add8.i2381 = fadd float undef, undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll index 8102769..7ec4fc1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_bullet3.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -7,6 +8,62 @@ target triple = "x86_64-apple-macosx10.8.0" ; Function Attrs: ssp uwtable define void @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_(%class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* %vertices) #0 align 2 { +; CHECK-LABEL: @_ZN11HullLibrary15CleanupVerticesEjPK9btVector3jRjPS0_fRS0_( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 undef, label [[RETURN:%.*]], label [[IF_END:%.*]] +; CHECK: if.end: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: br i1 undef, label [[IF_THEN17_1:%.*]], label [[IF_END22_1:%.*]] +; CHECK: for.end36: +; CHECK-NEXT: br label [[FOR_BODY144:%.*]] +; CHECK: for.body144: +; CHECK-NEXT: br i1 undef, label [[FOR_END227:%.*]], label [[FOR_BODY144]] +; CHECK: for.end227: +; CHECK-NEXT: br i1 undef, label [[FOR_END271:%.*]], label [[FOR_BODY233:%.*]] +; CHECK: for.body233: +; CHECK-NEXT: br i1 undef, label [[FOR_BODY233]], label [[FOR_END271]] +; CHECK: for.end271: +; CHECK-NEXT: [[TMP0:%.*]] = phi float [ 0x47EFFFFFE0000000, [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ 0x47EFFFFFE0000000, [[FOR_END227]] ], [ undef, [[FOR_BODY233]] ] +; CHECK-NEXT: [[SUB275:%.*]] = fsub float undef, [[TMP1]] +; CHECK-NEXT: [[SUB279:%.*]] = fsub float undef, [[TMP0]] +; CHECK-NEXT: br i1 undef, label [[IF_THEN291:%.*]], label [[RETURN]] +; CHECK: if.then291: +; CHECK-NEXT: [[MUL292:%.*]] = fmul float [[SUB275]], 5.000000e-01 +; CHECK-NEXT: [[ADD294:%.*]] = fadd float [[TMP1]], [[MUL292]] +; CHECK-NEXT: [[MUL295:%.*]] = fmul float [[SUB279]], 5.000000e-01 +; CHECK-NEXT: [[ADD297:%.*]] = fadd float [[TMP0]], [[MUL295]] +; CHECK-NEXT: br i1 undef, label [[IF_END332:%.*]], label [[IF_ELSE319:%.*]] +; CHECK: if.else319: +; CHECK-NEXT: br i1 undef, label [[IF_THEN325:%.*]], label [[IF_END327:%.*]] +; CHECK: if.then325: +; CHECK-NEXT: br label [[IF_END327]] +; CHECK: if.end327: +; CHECK-NEXT: br i1 undef, label [[IF_THEN329:%.*]], label [[IF_END332]] +; CHECK: if.then329: +; CHECK-NEXT: br label [[IF_END332]] +; CHECK: if.end332: +; CHECK-NEXT: [[DX272_1:%.*]] = phi float [ [[SUB275]], [[IF_THEN329]] ], [ [[SUB275]], [[IF_END327]] ], [ 0x3F847AE140000000, [[IF_THEN291]] ] +; CHECK-NEXT: [[DY276_1:%.*]] = phi float [ undef, [[IF_THEN329]] ], [ undef, [[IF_END327]] ], [ 0x3F847AE140000000, [[IF_THEN291]] ] +; CHECK-NEXT: [[SUB334:%.*]] = fsub float [[ADD294]], [[DX272_1]] +; CHECK-NEXT: [[SUB338:%.*]] = fsub float [[ADD297]], [[DY276_1]] +; CHECK-NEXT: [[ARRAYIDX_I_I606:%.*]] = getelementptr inbounds [[CLASS_BTVECTOR3_23_221_463_485_507_573_595_683_727_749_815_837_991_1585_1607_1629_1651_1849_2047_2069_2091_2113:%.*]], 
%class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* [[VERTICES:%.*]], i64 0, i32 0, i64 0 +; CHECK-NEXT: store float [[SUB334]], float* [[ARRAYIDX_I_I606]], align 4 +; CHECK-NEXT: [[ARRAYIDX3_I607:%.*]] = getelementptr inbounds [[CLASS_BTVECTOR3_23_221_463_485_507_573_595_683_727_749_815_837_991_1585_1607_1629_1651_1849_2047_2069_2091_2113]], %class.btVector3.23.221.463.485.507.573.595.683.727.749.815.837.991.1585.1607.1629.1651.1849.2047.2069.2091.2113* [[VERTICES]], i64 0, i32 0, i64 1 +; CHECK-NEXT: store float [[SUB338]], float* [[ARRAYIDX3_I607]], align 4 +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: ret void +; CHECK: if.then17.1: +; CHECK-NEXT: br label [[IF_END22_1]] +; CHECK: if.end22.1: +; CHECK-NEXT: br i1 undef, label [[IF_THEN17_2:%.*]], label [[IF_END22_2:%.*]] +; CHECK: if.then17.2: +; CHECK-NEXT: br label [[IF_END22_2]] +; CHECK: if.end22.2: +; CHECK-NEXT: br i1 undef, label [[FOR_END36:%.*]], label [[FOR_BODY]] +; entry: br i1 undef, label %return, label %if.end diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll index f10c862..5cee363 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -1,9 +1,50 @@ -; RUN: opt < %s -basicaa -slp-vectorizer -S +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.10.0" define void @testfunc(float* nocapture %dest, float* nocapture readonly %src) { +; CHECK-LABEL: @testfunc( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x float> zeroinitializer, [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP9]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], +; CHECK-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> +; CHECK-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], +; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x float> 
zeroinitializer, [[TMP12]] +; CHECK-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> , <2 x float> [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 +; CHECK-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[ADD13]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], +; CHECK-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP20]], <2 x float> [[TMP19]], <2 x float> +; CHECK-NEXT: [[TMP22:%.*]] = fcmp olt <2 x float> [[TMP21]], +; CHECK-NEXT: [[TMP23]] = select <2 x i1> [[TMP22]], <2 x float> , <2 x float> [[TMP21]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll index 28b7aa3..b5f736a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -6,6 +7,33 @@ target triple = "x86_64-apple-macosx10.8.0" ; Function Attrs: nounwind ssp uwtable define void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(%"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* nocapture %__last) { +; CHECK-LABEL: @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[_M_CUR2_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_CUR2_I_I]], align 8 +; CHECK-NEXT: [[_M_FIRST3_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__FIRST]], i64 0, i32 1 +; CHECK-NEXT: [[_M_CUR2_I_I81:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load double*, double** [[_M_CUR2_I_I81]], align 8 +; CHECK-NEXT: [[_M_FIRST3_I_I83:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* [[__LAST]], i64 0, i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = load double*, double** [[_M_FIRST3_I_I83]], align 8 +; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT:%.*]], label [[WHILE_COND_I_PREHEADER:%.*]] +; CHECK: while.cond.i.preheader: +; CHECK-NEXT: br label [[WHILE_COND_I:%.*]] +; 
CHECK: while.cond.i: +; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_BODY_I:%.*]] +; CHECK: while.body.i: +; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_COND_I]] +; CHECK: _ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: +; CHECK-NEXT: [[TMP3:%.*]] = phi double* [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP2]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi double* [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] +; CHECK-NEXT: store double* [[TMP4]], double** [[_M_CUR2_I_I]], align 8 +; CHECK-NEXT: store double* [[TMP3]], double** [[_M_FIRST3_I_I]], align 8 +; CHECK-NEXT: br i1 undef, label [[IF_THEN_I55:%.*]], label [[WHILE_COND:%.*]] +; CHECK: if.then.i55: +; CHECK-NEXT: br label [[WHILE_COND]] +; CHECK: while.cond: +; CHECK-NEXT: br label [[WHILE_COND]] +; entry: %_M_cur2.i.i = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731"* %__first, i64 0, i32 0 %0 = load double*, double** %_M_cur2.i.i, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll index e11be48..d149c27 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_flop7.ll @@ -1,10 +1,41 @@ -; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" ; Function Attrs: nounwind ssp uwtable define void @main() #0 { +; CHECK-LABEL: @main( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 undef, label [[WHILE_BODY:%.*]], label [[WHILE_END:%.*]] +; CHECK: while.body: +; CHECK-NEXT: unreachable +; CHECK: while.end: +; CHECK-NEXT: br i1 undef, label [[FOR_END80:%.*]], label [[FOR_BODY75_LR_PH:%.*]] +; CHECK: for.body75.lr.ph: +; CHECK-NEXT: br label [[FOR_BODY75:%.*]] +; CHECK: for.body75: +; CHECK-NEXT: br label [[FOR_BODY75]] +; CHECK: for.end80: +; CHECK-NEXT: br i1 undef, label [[FOR_END300:%.*]], label [[FOR_BODY267_LR_PH:%.*]] +; CHECK: for.body267.lr.ph: +; CHECK-NEXT: br label [[FOR_BODY267:%.*]] +; CHECK: for.body267: +; CHECK-NEXT: [[S_71010:%.*]] = phi double [ 0.000000e+00, [[FOR_BODY267_LR_PH]] ], [ [[ADD297:%.*]], [[FOR_BODY267]] ] +; CHECK-NEXT: [[MUL269:%.*]] = fmul double undef, undef +; CHECK-NEXT: [[MUL270:%.*]] = fmul double [[MUL269]], [[MUL269]] +; CHECK-NEXT: [[ADD282:%.*]] = fadd double undef, undef +; CHECK-NEXT: [[MUL283:%.*]] = fmul double [[MUL269]], [[ADD282]] +; CHECK-NEXT: [[ADD293:%.*]] = fadd double undef, undef +; CHECK-NEXT: [[MUL294:%.*]] = fmul double [[MUL270]], [[ADD293]] +; CHECK-NEXT: [[ADD295:%.*]] = fadd double undef, [[MUL294]] +; CHECK-NEXT: [[DIV296:%.*]] = fdiv double [[MUL283]], [[ADD295]] +; CHECK-NEXT: [[ADD297]] = fadd double [[S_71010]], [[DIV296]] +; CHECK-NEXT: br i1 undef, label [[FOR_BODY267]], label [[FOR_END300]] +; CHECK: for.end300: +; CHECK-NEXT: unreachable +; entry: br i1 undef, label %while.body, label %while.end 
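Each of these crash tests originally ran opt without piping the output into FileCheck, so they only verified that the pass pipeline did not crash; with this update the RUN lines gain "| FileCheck %s" and the full -S output is checked against autogenerated assertions. As a minimal sketch of how one such file is regenerated (the build directory and opt path here are illustrative assumptions, not part of the commit):

    $ llvm/utils/update_test_checks.py --opt-binary=build/bin/opt \
        llvm/test/Transforms/SLPVectorizer/X86/crash_gep.ll

The script reruns every RUN line in the file through the given opt binary and rewrites the per-function CHECK lines in place, which is why each updated test also gains the "NOTE: Assertions have been autogenerated" header.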
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_gep.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_gep.ll index bd1e8f7..eca21e7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_gep.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_gep.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-unknown-linux-gnu +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -7,6 +8,17 @@ target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: nounwind uwtable define i32 @fn1() { +; CHECK-LABEL: @fn1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i64*, i64** @a, align 8 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint i64* [[ADD_PTR]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[TMP0]], i64 2 +; CHECK-NEXT: store i64 [[TMP1]], i64* [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint i64* [[ARRAYIDX]] to i64 +; CHECK-NEXT: store i64 [[TMP2]], i64* [[ADD_PTR]], align 8 +; CHECK-NEXT: ret i32 undef +; entry: %0 = load i64*, i64** @a, align 8 %add.ptr = getelementptr inbounds i64, i64* %0, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll index 70b13fd..eb1cb32 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll @@ -1,10 +1,45 @@ -; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" ; Function Attrs: nounwind ssp uwtable define void @RCModelEstimator() { +; CHECK-LABEL: @RCModelEstimator( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 undef, label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END_THREAD:%.*]] +; CHECK: for.end.thread: +; CHECK-NEXT: unreachable +; CHECK: for.body.lr.ph: +; CHECK-NEXT: br i1 undef, label [[FOR_END:%.*]], label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: br i1 undef, label [[FOR_BODY3:%.*]], label [[IF_END103:%.*]] +; CHECK: for.cond14.preheader: +; CHECK-NEXT: br i1 undef, label [[FOR_BODY16_LR_PH:%.*]], label [[IF_END103]] +; CHECK: for.body16.lr.ph: +; CHECK-NEXT: br label [[FOR_BODY16:%.*]] +; CHECK: for.body3: +; CHECK-NEXT: br i1 undef, label [[IF_THEN7:%.*]], label [[FOR_INC11:%.*]] +; CHECK: if.then7: +; CHECK-NEXT: br label [[FOR_INC11]] +; CHECK: for.inc11: +; CHECK-NEXT: br i1 false, label [[FOR_COND14_PREHEADER:%.*]], label [[FOR_BODY3]] +; CHECK: for.body16: +; CHECK-NEXT: br i1 undef, label [[FOR_END39:%.*]], label [[FOR_BODY16]] +; CHECK: for.end39: +; CHECK-NEXT: br i1 undef, label [[IF_END103]], label [[FOR_COND45_PREHEADER:%.*]] +; CHECK: for.cond45.preheader: +; CHECK-NEXT: br i1 undef, label [[IF_THEN88:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then88: +; CHECK-NEXT: br label 
[[IF_END103]] +; CHECK: if.else: +; CHECK-NEXT: br label [[IF_END103]] +; CHECK: if.end103: +; CHECK-NEXT: ret void +; entry: br i1 undef, label %for.body.lr.ph, label %for.end.thread @@ -66,6 +101,17 @@ if.end103: ; preds = %if.else, %if.then88 define void @intrapred_luma() { +; CHECK-LABEL: @intrapred_luma( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV153:%.*]] = trunc i32 undef to i16 +; CHECK-NEXT: [[ARRAYIDX154:%.*]] = getelementptr inbounds [13 x i16], [13 x i16]* undef, i64 0, i64 12 +; CHECK-NEXT: store i16 [[CONV153]], i16* [[ARRAYIDX154]], align 8 +; CHECK-NEXT: [[ARRAYIDX155:%.*]] = getelementptr inbounds [13 x i16], [13 x i16]* undef, i64 0, i64 11 +; CHECK-NEXT: store i16 [[CONV153]], i16* [[ARRAYIDX155]], align 2 +; CHECK-NEXT: [[ARRAYIDX156:%.*]] = getelementptr inbounds [13 x i16], [13 x i16]* undef, i64 0, i64 10 +; CHECK-NEXT: store i16 [[CONV153]], i16* [[ARRAYIDX156]], align 4 +; CHECK-NEXT: ret void +; entry: %conv153 = trunc i32 undef to i16 %arrayidx154 = getelementptr inbounds [13 x i16], [13 x i16]* undef, i64 0, i64 12 @@ -78,6 +124,18 @@ entry: } define fastcc void @dct36(double* %inbuf) { +; CHECK-LABEL: @dct36( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds double, double* [[INBUF:%.*]], i64 2 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds double, double* [[INBUF]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX44]], align 8 +; CHECK-NEXT: [[ADD46:%.*]] = fadd double [[TMP0]], undef +; CHECK-NEXT: store double [[ADD46]], double* [[ARRAYIDX41]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[INBUF]], align 8 +; CHECK-NEXT: [[ADD49:%.*]] = fadd double [[TMP1]], [[TMP0]] +; CHECK-NEXT: store double [[ADD49]], double* [[ARRAYIDX44]], align 8 +; CHECK-NEXT: ret void +; entry: %arrayidx41 = getelementptr inbounds double, double* %inbuf, i64 2 %arrayidx44 = getelementptr inbounds double, double* %inbuf, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll index f82343f..f12de2a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_mandeltext.ll @@ -1,9 +1,46 @@ -; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" define void @main() { +; CHECK-LABEL: @main( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: br label [[FOR_COND4_PREHEADER:%.*]] +; CHECK: for.cond4.preheader: +; CHECK-NEXT: br label [[FOR_BODY6:%.*]] +; CHECK: for.body6: +; CHECK-NEXT: br label [[FOR_BODY12:%.*]] +; CHECK: for.body12: +; CHECK-NEXT: [[FZIMG_069:%.*]] = phi double [ undef, [[FOR_BODY6]] ], [ [[ADD19:%.*]], [[IF_END:%.*]] ] +; CHECK-NEXT: [[FZREAL_068:%.*]] = phi double [ undef, [[FOR_BODY6]] ], [ [[ADD20:%.*]], [[IF_END]] ] +; CHECK-NEXT: [[MUL13:%.*]] = fmul double [[FZREAL_068]], [[FZREAL_068]] +; CHECK-NEXT: [[MUL14:%.*]] = fmul double [[FZIMG_069]], [[FZIMG_069]] +; CHECK-NEXT: [[ADD15:%.*]] = fadd double [[MUL13]], [[MUL14]] +; CHECK-NEXT: [[CMP16:%.*]] = fcmp ogt double 
[[ADD15]], 4.000000e+00 +; CHECK-NEXT: br i1 [[CMP16]], label [[FOR_INC21:%.*]], label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[FZIMG_069]] +; CHECK-NEXT: [[ADD19]] = fadd double undef, [[MUL18]] +; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MUL13]], [[MUL14]] +; CHECK-NEXT: [[ADD20]] = fadd double undef, [[SUB]] +; CHECK-NEXT: br i1 undef, label [[FOR_BODY12]], label [[FOR_INC21]] +; CHECK: for.inc21: +; CHECK-NEXT: br i1 undef, label [[FOR_END23:%.*]], label [[FOR_BODY6]] +; CHECK: for.end23: +; CHECK-NEXT: br i1 undef, label [[IF_THEN25:%.*]], label [[IF_THEN26:%.*]] +; CHECK: if.then25: +; CHECK-NEXT: br i1 undef, label [[FOR_END44:%.*]], label [[FOR_COND4_PREHEADER]] +; CHECK: if.then26: +; CHECK-NEXT: unreachable +; CHECK: for.end44: +; CHECK-NEXT: br i1 undef, label [[FOR_END48:%.*]], label [[FOR_BODY]] +; CHECK: for.end48: +; CHECK-NEXT: ret void +; entry: br label %for.body @@ -54,6 +91,26 @@ for.end48: ; preds = %for.end44 %struct.hoge = type { double, double, double} define void @zot(%struct.hoge* %arg) { +; CHECK-LABEL: @zot( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = load double, double* undef, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load double, double* undef, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[TMP]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], undef +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[ARG:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> undef, [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP3]], undef +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP7]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: br i1 undef, label [[BB11:%.*]], label [[BB12:%.*]] +; CHECK: bb11: +; CHECK-NEXT: br label [[BB14:%.*]] +; CHECK: bb12: +; CHECK-NEXT: br label [[BB14]] +; CHECK: bb14: +; CHECK-NEXT: ret void +; bb: %tmp = load double, double* undef, align 8 %tmp1 = fsub double %tmp, undef @@ -85,6 +142,22 @@ bb14: ; preds = %bb12, %bb11 %struct.rc4_state.0.24 = type { i32, i32, [256 x i32] } define void @rc4_crypt(%struct.rc4_state.0.24* nocapture %s) { +; CHECK-LABEL: @rc4_crypt( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X1:%.*]] = getelementptr inbounds [[STRUCT_RC4_STATE_0_24:%.*]], %struct.rc4_state.0.24* [[S:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[Y2:%.*]] = getelementptr inbounds [[STRUCT_RC4_STATE_0_24]], %struct.rc4_state.0.24* [[S]], i64 0, i32 1 +; CHECK-NEXT: br i1 undef, label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[CONV4:%.*]] = and i32 undef, 255 +; CHECK-NEXT: [[CONV7:%.*]] = and i32 undef, 255 +; CHECK-NEXT: br i1 undef, label [[FOR_END]], label [[FOR_BODY]] +; CHECK: for.end: +; CHECK-NEXT: [[X_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY:%.*]] ], [ [[CONV4]], [[FOR_BODY]] ] +; CHECK-NEXT: [[Y_0_LCSSA:%.*]] = phi i32 [ undef, [[ENTRY]] ], [ [[CONV7]], [[FOR_BODY]] ] +; CHECK-NEXT: store i32 [[X_0_LCSSA]], i32* [[X1]], align 4 +; CHECK-NEXT: store i32 [[Y_0_LCSSA]], i32* [[Y2]], align 4 +; CHECK-NEXT: ret void +; entry: %x1 = getelementptr inbounds %struct.rc4_state.0.24, %struct.rc4_state.0.24* %s, i64 0, i32 0 %y2 = getelementptr inbounds %struct.rc4_state.0.24, %struct.rc4_state.0.24* %s, i64 0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll index e1df98d..a52ec6b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -12,6 +13,30 @@ target triple = "x86_64-apple-macosx10.8.0" @e = common global i32 0, align 4 define i32 @fn1() { +; CHECK-LABEL: @fn1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds (%struct.DState, %struct.DState* @b, i32 0, i32 0), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds (%struct.DState, %struct.DState* @b, i32 0, i32 1), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* @d, align 4 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[SW_BB:%.*]], label [[SAVE_STATE_AND_RETURN:%.*]] +; CHECK: sw.bb: +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* @c, align 4 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP3]], 7 +; CHECK-NEXT: store i32 [[AND]], i32* @a, align 4 +; CHECK-NEXT: switch i32 [[AND]], label [[IF_END:%.*]] [ +; CHECK-NEXT: i32 7, label [[SAVE_STATE_AND_RETURN]] +; CHECK-NEXT: i32 0, label [[SAVE_STATE_AND_RETURN]] +; CHECK-NEXT: ] +; CHECK: if.end: +; CHECK-NEXT: br label [[SAVE_STATE_AND_RETURN]] +; CHECK: save_state_and_return: +; CHECK-NEXT: [[T_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[TMP0]], [[ENTRY:%.*]] ], [ [[TMP0]], [[SW_BB]] ], [ [[TMP0]], [[SW_BB]] ] +; CHECK-NEXT: [[F_0:%.*]] = phi i32 [ 0, [[IF_END]] ], [ [[TMP1]], [[ENTRY]] ], [ 0, [[SW_BB]] ], [ 0, [[SW_BB]] ] +; CHECK-NEXT: store i32 [[T_0]], i32* getelementptr inbounds (%struct.DState, %struct.DState* @b, i32 0, i32 0), align 4 +; CHECK-NEXT: store i32 [[F_0]], i32* getelementptr inbounds (%struct.DState, %struct.DState* @b, i32 0, i32 1), align 4 +; CHECK-NEXT: ret i32 undef +; entry: %0 = load i32, i32* getelementptr inbounds (%struct.DState, %struct.DState* @b, i32 0, i32 0), align 4 %1 = load i32, i32* getelementptr inbounds (%struct.DState, %struct.DState* @b, i32 0, i32 1), align 4 @@ -24,8 +49,8 @@ sw.bb: ; preds = %entry %and = and i32 %3, 7 store i32 %and, i32* @a, align 4 switch i32 %and, label %if.end [ - i32 7, label %save_state_and_return - i32 0, label %save_state_and_return + i32 7, label %save_state_and_return + i32 0, label %save_state_and_return ] if.end: ; preds = %sw.bb diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll index 916772c..9108e84 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_scheduling.ll @@ -1,9 +1,41 @@ -; RUN: opt < %s -basicaa -disable-verify -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -disable-verify -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" 
target triple = "x86_64-apple-darwin13.3.0" define void @_foo(double %p1, double %p2, double %p3) #0 { +; CHECK-LABEL: @_foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TAB1:%.*]] = alloca [256 x i32], align 16 +; CHECK-NEXT: [[TAB2:%.*]] = alloca [256 x i32], align 16 +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[MUL19:%.*]] = fmul double [[P1:%.*]], 1.638400e+04 +; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04 +; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03 +; CHECK-NEXT: [[MUL21:%.*]] = fmul double [[P2:%.*]], 1.638400e+04 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[T_0259:%.*]] = phi double [ 0.000000e+00, [[BB1]] ], [ [[ADD27:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[P3_ADDR_0258:%.*]] = phi double [ [[ADD]], [[BB1]] ], [ [[ADD28:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> undef, double [[T_0259]], i32 0 +; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I237]]) +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 0, i64 [[INDVARS_IV266]] +; CHECK-NEXT: store i32 [[X13]], i32* [[ARRAYIDX]], align 4, !tbaa !0 +; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> undef, double [[P3_ADDR_0258]], i32 0 +; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I]]) +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]] +; CHECK-NEXT: store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, !tbaa !0 +; CHECK-NEXT: [[ADD27]] = fadd double [[MUL19]], [[T_0259]] +; CHECK-NEXT: [[ADD28]] = fadd double [[MUL21]], [[P3_ADDR_0258]] +; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]] +; CHECK: return: +; CHECK-NEXT: ret void +; entry: %tab1 = alloca [256 x i32], align 16 %tab2 = alloca [256 x i32], align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll index 5a576c2..9a07cfc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_sim4b1.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -6,6 +7,83 @@ target triple = "x86_64-apple-macosx10.8.0" %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171 = type { i32, i32, i32, i32, i32, i32, [8 x i8] } define void @SIM4() { +; CHECK-LABEL: @SIM4( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 undef, label [[RETURN:%.*]], label [[LOR_LHS_FALSE:%.*]] +; CHECK: lor.lhs.false: +; CHECK-NEXT: br i1 undef, label [[RETURN]], label [[IF_END:%.*]] +; CHECK: if.end: +; CHECK-NEXT: br i1 
undef, label [[FOR_END605:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: br i1 undef, label [[FOR_INC603:%.*]], label [[IF_END12:%.*]] +; CHECK: if.end12: +; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE:%.*]], label [[LAND_LHS_TRUE167:%.*]] +; CHECK: land.lhs.true: +; CHECK-NEXT: br i1 undef, label [[IF_THEN17:%.*]], label [[LAND_LHS_TRUE167]] +; CHECK: if.then17: +; CHECK-NEXT: br i1 undef, label [[IF_END98:%.*]], label [[LAND_RHS_LR_PH:%.*]] +; CHECK: land.rhs.lr.ph: +; CHECK-NEXT: unreachable +; CHECK: if.end98: +; CHECK-NEXT: [[FROM299:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171:%.*]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 1 +; CHECK-NEXT: br i1 undef, label [[LAND_LHS_TRUE167]], label [[IF_THEN103:%.*]] +; CHECK: if.then103: +; CHECK-NEXT: [[DOTSUB100:%.*]] = select i1 undef, i32 250, i32 undef +; CHECK-NEXT: [[MUL114:%.*]] = shl nsw i32 [[DOTSUB100]], 2 +; CHECK-NEXT: [[FROM1115:%.*]] = getelementptr inbounds [[STRUCT__EXON_T_12_103_220_363_480_649_740_857_1039_1065_1078_1091_1117_1130_1156_1169_1195_1221_1234_1286_1299_1312_1338_1429_1455_1468_1494_1520_1884_1897_1975_2066_2105_2170_2171]], %struct._exon_t.12.103.220.363.480.649.740.857.1039.1065.1078.1091.1117.1130.1156.1169.1195.1221.1234.1286.1299.1312.1338.1429.1455.1468.1494.1520.1884.1897.1975.2066.2105.2170.2171* undef, i64 0, i32 0 +; CHECK-NEXT: [[COND125:%.*]] = select i1 undef, i32 undef, i32 [[MUL114]] +; CHECK-NEXT: br label [[FOR_COND_I:%.*]] +; CHECK: for.cond.i: +; CHECK-NEXT: [[ROW_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874:%.*]] ], [ [[DOTSUB100]], [[IF_THEN103]] ] +; CHECK-NEXT: [[COL_0_I:%.*]] = phi i32 [ undef, [[LAND_RHS_I874]] ], [ [[COND125]], [[IF_THEN103]] ] +; CHECK-NEXT: br i1 undef, label [[LAND_RHS_I874]], label [[FOR_END_I:%.*]] +; CHECK: land.rhs.i874: +; CHECK-NEXT: br i1 undef, label [[FOR_COND_I]], label [[FOR_END_I]] +; CHECK: for.end.i: +; CHECK-NEXT: br i1 undef, label [[IF_THEN_I:%.*]], label [[IF_END_I:%.*]] +; CHECK: if.then.i: +; CHECK-NEXT: [[ADD14_I:%.*]] = add nsw i32 [[ROW_0_I]], undef +; CHECK-NEXT: [[ADD15_I:%.*]] = add nsw i32 [[COL_0_I]], undef +; CHECK-NEXT: br label [[EXTEND_BW_EXIT:%.*]] +; CHECK: if.end.i: +; CHECK-NEXT: [[ADD16_I:%.*]] = add i32 [[COND125]], [[DOTSUB100]] +; CHECK-NEXT: [[CMP26514_I:%.*]] = icmp slt i32 [[ADD16_I]], 0 +; CHECK-NEXT: br i1 [[CMP26514_I]], label [[FOR_END33_I:%.*]], label [[FOR_BODY28_LR_PH_I:%.*]] +; CHECK: for.body28.lr.ph.i: +; CHECK-NEXT: br label [[FOR_END33_I]] +; CHECK: for.end33.i: +; CHECK-NEXT: br i1 undef, label [[FOR_END58_I:%.*]], label [[FOR_BODY52_LR_PH_I:%.*]] +; CHECK: for.body52.lr.ph.i: +; CHECK-NEXT: br label [[FOR_END58_I]] +; CHECK: for.end58.i: +; CHECK-NEXT: br label [[WHILE_COND260_I:%.*]] +; CHECK: while.cond260.i: +; CHECK-NEXT: br i1 undef, label [[LAND_RHS263_I:%.*]], label [[WHILE_END275_I:%.*]] +; CHECK: land.rhs263.i: +; CHECK-NEXT: br i1 undef, label [[WHILE_COND260_I]], label [[WHILE_END275_I]] +; CHECK: while.end275.i: +; CHECK-NEXT: br label [[EXTEND_BW_EXIT]] +; CHECK: extend_bw.exit: +; CHECK-NEXT: [[ADD14_I1262:%.*]] = phi i32 [ [[ADD14_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] 
+; CHECK-NEXT: [[ADD15_I1261:%.*]] = phi i32 [ [[ADD15_I]], [[IF_THEN_I]] ], [ undef, [[WHILE_END275_I]] ] +; CHECK-NEXT: br i1 false, label [[IF_THEN157:%.*]], label [[LAND_LHS_TRUE167]] +; CHECK: if.then157: +; CHECK-NEXT: [[ADD158:%.*]] = add nsw i32 [[ADD14_I1262]], 1 +; CHECK-NEXT: store i32 [[ADD158]], i32* [[FROM299]], align 4 +; CHECK-NEXT: [[ADD160:%.*]] = add nsw i32 [[ADD15_I1261]], 1 +; CHECK-NEXT: store i32 [[ADD160]], i32* [[FROM1115]], align 4 +; CHECK-NEXT: br label [[LAND_LHS_TRUE167]] +; CHECK: land.lhs.true167: +; CHECK-NEXT: unreachable +; CHECK: for.inc603: +; CHECK-NEXT: br i1 undef, label [[FOR_BODY]], label [[FOR_END605]] +; CHECK: for.end605: +; CHECK-NEXT: unreachable +; CHECK: return: +; CHECK-NEXT: ret void +; entry: br i1 undef, label %return, label %lor.lhs.false diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll index 273584c..e2d3637 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -8,6 +9,45 @@ target triple = "x86_64-apple-macosx10.8.0" ; Function Attrs: ssp uwtable define void @main() #0 { +; CHECK-LABEL: @main( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 undef, label [[COND_TRUE:%.*]], label [[COND_END:%.*]] +; CHECK: cond.true: +; CHECK-NEXT: unreachable +; CHECK: cond.end: +; CHECK-NEXT: br label [[INVOKE_CONT:%.*]] +; CHECK: invoke.cont: +; CHECK-NEXT: br i1 undef, label [[ARRAYCTOR_CONT:%.*]], label [[INVOKE_CONT]] +; CHECK: arrayctor.cont: +; CHECK-NEXT: [[AGG_TMP99208_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_113_119_137_149_185_329_389_416:%.*]], %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 0, i32 0 +; CHECK-NEXT: [[AGG_TMP101211_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_113_119_137_149_185_329_389_416]], %struct.Ray.5.11.53.113.119.137.149.185.329.389.416* undef, i64 0, i32 1, i32 0 +; CHECK-NEXT: br label [[FOR_COND36_PREHEADER:%.*]] +; CHECK: for.cond36.preheader: +; CHECK-NEXT: br i1 undef, label [[FOR_BODY42_LR_PH_US:%.*]], label [[_Z5CLAMPD_EXIT_1:%.*]] +; CHECK: cond.false51.us: +; CHECK-NEXT: unreachable +; CHECK: cond.true48.us: +; CHECK-NEXT: br i1 undef, label [[COND_TRUE63_US:%.*]], label [[COND_FALSE66_US:%.*]] +; CHECK: cond.false66.us: +; CHECK-NEXT: [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, undef +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[ADD_I276_US]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double 0xBFA5CC2D1960285F, i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> , [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> , [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> undef, [[TMP2]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP6]], align 8 +; 
CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP7]], align 8 +; CHECK-NEXT: unreachable +; CHECK: cond.true63.us: +; CHECK-NEXT: unreachable +; CHECK: for.body42.lr.ph.us: +; CHECK-NEXT: br i1 undef, label [[COND_TRUE48_US:%.*]], label [[COND_FALSE51_US:%.*]] +; CHECK: _Z5clampd.exit.1: +; CHECK-NEXT: br label [[FOR_COND36_PREHEADER]] +; entry: br i1 undef, label %cond.true, label %cond.end @@ -67,6 +107,27 @@ _Z5clampd.exit.1: ; preds = %for.cond36.preheade %struct.Vec.0.6.48.90.132.186.192.198.234.252.258.264.270.276.282.288.378.432.438.450.456.594.600 = type { double, double, double } define void @_Z8radianceRK3RayiPt() #0 { +; CHECK-LABEL: @_Z8radianceRK3RayiPt( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 undef, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]] +; CHECK: if.then38: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double undef, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> undef, [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> undef, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> undef, [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> undef, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> undef, [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> undef, [[TMP6]] +; CHECK-NEXT: [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY_5_11_53_95_137_191_197_203_239_257_263_269_275_281_287_293_383_437_443_455_461_599_601:%.*]], %struct.Ray.5.11.53.95.137.191.197.203.239.257.263.269.275.281.287.293.383.437.443.455.461.599.601* undef, i64 0, i32 1, i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[AGG_TMP74663_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP7]], <2 x double>* [[TMP8]], align 8 +; CHECK-NEXT: br label [[RETURN:%.*]] +; CHECK: if.then78: +; CHECK-NEXT: br label [[RETURN]] +; CHECK: return: +; CHECK-NEXT: ret void +; entry: br i1 undef, label %if.then78, label %if.then38 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll index 45ca99a..24f5bad 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_vectorizeTree.ll @@ -1,14 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -basicaa -slp-vectorizer -mtriple=x86_64-apple-macosx10.9.0 -mcpu=corei7-avx -S < %s | FileCheck %s target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.9.0" ; This test used to crash because we were following phi chains incorrectly. -; We used indices to get the incoming value of two phi nodes rather than +; We used indices to get the incoming value of two phi nodes rather than ; incoming block lookup. ; This can give wrong results when the ordering of incoming ; edges in the two phi nodes doesn't match.
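To make the ordering hazard in the comment above concrete, consider a hypothetical pair of phis (illustrative only, not taken from this test):

    %x = phi double [ 1.0, %bb1 ], [ 2.0, %bb2 ]
    %y = phi double [ 4.0, %bb2 ], [ 3.0, %bb1 ]

The incoming value of %y on the edge from %bb1 is 3.0, but reusing %x's operand index for %bb1 (index 0) to query %y returns 4.0. Looking the value up by predecessor block, as PHINode::getIncomingValueForBlock does, gives the right answer regardless of the order in which each phi lists its incoming edges.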
-;CHECK-LABEL: bar %0 = type { %1, %2 } %1 = type { double, double } @@ -17,6 +17,36 @@ target triple = "x86_64-apple-macosx10.9.0" ;define fastcc void @bar() { define void @bar() { +; CHECK-LABEL: @bar( +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[TMP0:%.*]], %0* undef, i64 0, i32 1, i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[TMP0]], %0* undef, i64 0, i32 1, i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[TMP0]], %0* undef, i64 0, i32 1, i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[TMP0]], %0* undef, i64 0, i32 1, i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[TMP0]], %0* undef, i64 0, i32 1, i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[TMP0]], %0* undef, i64 0, i32 1, i32 1 +; CHECK-NEXT: br label [[TMP7:%.*]] +; CHECK: [[TMP8:%.*]] = phi <2 x double> [ , [[TMP0]] ], [ [[TMP11:%.*]], [[TMP21:%.*]] ], [ [[TMP11]], [[TMP18:%.*]] ], [ [[TMP11]], [[TMP18]] ] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[TMP1]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP9]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[TMP3]] to <2 x double>* +; CHECK-NEXT: [[TMP11]] = load <2 x double>, <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: br i1 undef, label [[TMP12:%.*]], label [[TMP13:%.*]] +; CHECK: ret void +; CHECK: [[TMP14:%.*]] = bitcast double* [[TMP5]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP14]], align 8 +; CHECK-NEXT: br i1 undef, label [[TMP15:%.*]], label [[TMP16:%.*]] +; CHECK: br label [[TMP16]] +; CHECK: br i1 undef, label [[TMP17:%.*]], label [[TMP18]] +; CHECK: unreachable +; CHECK: [[TMP19:%.*]] = extractelement <2 x double> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP11]], i32 1 +; CHECK-NEXT: switch i32 undef, label [[TMP21]] [ +; CHECK-NEXT: i32 32, label [[TMP7]] +; CHECK-NEXT: i32 103, label [[TMP7]] +; CHECK-NEXT: ] +; CHECK: br i1 undef, label [[TMP7]], label [[TMP22:%.*]] +; CHECK: unreachable +; %1 = getelementptr inbounds %0, %0* undef, i64 0, i32 1, i32 0 %2 = getelementptr inbounds %0, %0* undef, i64 0, i32 1, i32 1 %3 = getelementptr inbounds %0, %0* undef, i64 0, i32 1, i32 0 @@ -53,8 +83,8 @@ define void @bar() { ;