From 0ad87ffdcc2379f1156651d1953a162379e2de8c Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Wed, 15 Mar 2023 12:33:00 -0700
Subject: [PATCH] [SLP]Introduce shuffle of the nodes + gather/vectorbuild of
 the remaining scalars.

Currently the compiler does not support mixing shuffled nodes with a
gather/buildvector of the remaining scalar values. Supporting this mix
may reduce the total number of instructions and improve the performance
of gather/buildvector sequences.

Part of D110978

Differential Revision: https://reviews.llvm.org/D146167
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 179 +++++++++++++++--
 llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll | 22 +--
 .../Transforms/SLPVectorizer/X86/commutativity.ll | 9 +-
 .../SLPVectorizer/X86/crash_clear_undefs.ll | 28 ++-
 .../SLPVectorizer/X86/crash_netbsd_decompress.ll | 2 +-
 .../Transforms/SLPVectorizer/X86/crash_smallpt.ll | 10 +-
 llvm/test/Transforms/SLPVectorizer/X86/cse.ll | 12 +-
 .../SLPVectorizer/X86/jumbled-load-multiuse.ll | 9 +-
 .../test/Transforms/SLPVectorizer/X86/lookahead.ll | 5 +-
 .../SLPVectorizer/X86/matched-shuffled-entries.ll | 31 ++-
 .../SLPVectorizer/X86/memory-runtime-checks.ll | 12 +-
 .../Transforms/SLPVectorizer/X86/operandorder.ll | 220 ++++++++++-----------
 llvm/test/Transforms/SLPVectorizer/X86/phi.ll | 69 ++++---
 llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll | 30 ++-
 .../SLPVectorizer/X86/reduction-logical.ll | 17 +-
 .../Transforms/SLPVectorizer/X86/reduction2.ll | 4 +-
 .../SLPVectorizer/X86/reorder-clustered-node.ll | 21 +-
 .../X86/reorder-reused-masked-gather.ll | 2 +-
 .../SLPVectorizer/X86/root-trunc-extract-reuse.ll | 5 +-
 .../SLPVectorizer/X86/scatter-vectorize-reorder.ll | 14 +-
 20 files changed, 406 insertions(+), 295 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7bc9a4f..9da2fd4 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2489,8 +2489,9 @@ private:
   /// the bundle
   void setInsertPointAfterBundle(const TreeEntry *E);
 
-  /// \returns a vector from a collection of scalars in \p VL.
-  Value *gather(ArrayRef VL);
+  /// \returns a vector from a collection of scalars in \p VL. If \p Root is not
+  /// specified, the starting vector value is poison.
+  Value *gather(ArrayRef VL, Value *Root = nullptr);
 
   /// \returns whether the VectorizableTree is fully vectorizable and will
   /// be beneficial even the tree height is tiny.
@@ -6596,9 +6597,10 @@ protected:
         LocalVF = SVOpTy->getNumElements();
       SmallVector ExtMask(Mask.size(), UndefMaskElem);
       for (auto [Idx, I] : enumerate(Mask)) {
-        if (I == UndefMaskElem)
-          continue;
-        ExtMask[Idx] = SV->getMaskValue(I);
+        if (I == UndefMaskElem ||
+            static_cast(I) >= SV->getShuffleMask().size())
+          continue;
+        ExtMask[Idx] = SV->getMaskValue(I);
       }
       bool IsOp1Undef = isUndefVector(SV->getOperand(0),
@@ -8922,7 +8924,7 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
   Builder.SetCurrentDebugLocation(Front->getDebugLoc());
 }
 
-Value *BoUpSLP::gather(ArrayRef VL) {
+Value *BoUpSLP::gather(ArrayRef VL, Value *Root) {
   // List of instructions/lanes from current block and/or the blocks which are
These instructions will be inserted at the end to // make it possible to optimize loops and hoist invariant instructions out of @@ -8939,7 +8941,8 @@ Value *BoUpSLP::gather(ArrayRef VL) { for (int I = 0, E = VL.size(); I < E; ++I) { if (auto *Inst = dyn_cast(VL[I])) if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) || - getTreeEntry(Inst) || (L && (L->contains(Inst)))) && + getTreeEntry(Inst) || + (L && (!Root || L->isLoopInvariant(Root)) && L->contains(Inst))) && PostponedIndices.insert(I).second) PostponedInsts.emplace_back(Inst, I); } @@ -8962,7 +8965,7 @@ Value *BoUpSLP::gather(ArrayRef VL) { Value *Val0 = isa(VL[0]) ? cast(VL[0])->getValueOperand() : VL[0]; FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size()); - Value *Vec = PoisonValue::get(VecTy); + Value *Vec = Root ? Root : PoisonValue::get(VecTy); SmallVector NonConsts; // Insert constant values at first. for (int I = 0, E = VL.size(); I < E; ++I) { @@ -8972,6 +8975,18 @@ Value *BoUpSLP::gather(ArrayRef VL) { NonConsts.push_back(I); continue; } + if (Root) { + if (!isa(VL[I])) { + NonConsts.push_back(I); + continue; + } + if (isa(VL[I])) + continue; + if (auto *SV = dyn_cast(Root)) { + if (SV->getMaskValue(I) == UndefMaskElem) + continue; + } + } Vec = CreateInsertElement(Vec, VL[I], I); } // Insert non-constant values. @@ -9210,9 +9225,34 @@ public: add(V1, NewMask); } /// Finalize emission of the shuffles. + /// \param Action the action (if any) to be performed before final applying of + /// the \p ExtMask mask. Value * - finalize(ArrayRef ExtMask = std::nullopt) { + finalize(ArrayRef ExtMask, unsigned VF = 0, + function_ref &)> Action = {}) { IsFinalized = true; + if (Action) { + Value *Vec = InVectors.front(); + if (InVectors.size() == 2) { + Vec = createShuffle(Vec, InVectors.back(), CommonMask); + InVectors.pop_back(); + } else { + Vec = createShuffle(Vec, nullptr, CommonMask); + } + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (CommonMask[Idx] != UndefMaskElem) + CommonMask[Idx] = Idx; + assert(VF > 0 && + "Expected vector length for the final value before action."); + unsigned VecVF = cast(Vec->getType())->getNumElements(); + if (VecVF < VF) { + SmallVector ResizeMask(VF, UndefMaskElem); + std::iota(ResizeMask.begin(), std::next(ResizeMask.begin(), VecVF), 0); + Vec = createShuffle(Vec, nullptr, ResizeMask); + } + Action(Vec, CommonMask); + InVectors.front() = Vec; + } if (!ExtMask.empty()) { if (CommonMask.empty()) { CommonMask.assign(ExtMask.begin(), ExtMask.end()); @@ -9378,7 +9418,33 @@ Value *BoUpSLP::createBuildVector(const TreeEntry *E) { inversePermutation(E->ReorderIndices, ReorderMask); if (!ReorderMask.empty()) reorderScalars(GatheredScalars, ReorderMask); - + auto FindReusedSplat = [&](SmallVectorImpl &Mask) { + if (!isSplat(E->Scalars) || none_of(E->Scalars, [](Value *V) { + return isa(V) && !isa(V); + })) + return false; + TreeEntry *UserTE = E->UserTreeIndices.back().UserTE; + unsigned EdgeIdx = E->UserTreeIndices.back().EdgeIdx; + if (UserTE->getNumOperands() != 2) + return false; + auto *It = + find_if(VectorizableTree, [=](const std::unique_ptr &TE) { + return find_if(TE->UserTreeIndices, [=](const EdgeInfo &EI) { + return EI.UserTE == UserTE && EI.EdgeIdx != EdgeIdx; + }) != TE->UserTreeIndices.end(); + }); + if (It == VectorizableTree.end()) + return false; + unsigned I = + *find_if_not(Mask, [](int Idx) { return Idx == UndefMaskElem; }); + int Sz = Mask.size(); + if (all_of(Mask, [Sz](int Idx) { return Idx < 2 * Sz; }) && + 
ShuffleVectorInst::isIdentityMask(Mask)) + std::iota(Mask.begin(), Mask.end(), 0); + else + std::fill(Mask.begin(), Mask.end(), I); + return true; + }; ShuffleInstructionBuilder ShuffleBuilder(Builder, *this); Value *Vec = nullptr; SmallVector Mask; @@ -9434,8 +9500,9 @@ Value *BoUpSLP::createBuildVector(const TreeEntry *E) { } } } - if ((ExtractShuffle || GatherShuffle) && - all_of(GatheredScalars, PoisonValue::classof)) { + if (ExtractShuffle || GatherShuffle) { + bool IsNonPoisoned = true; + bool IsUsedInExpr = false; Value *Vec1 = nullptr; if (ExtractShuffle) { // Gather of extractelements can be represented as just a shuffle of @@ -9459,22 +9526,97 @@ Value *BoUpSLP::createBuildVector(const TreeEntry *E) { Vec2 = EI->getVectorOperand(); } } - if (Vec2) + if (Vec2) { ShuffleBuilder.add(Vec1, Vec2, ExtractMask); - else if (Vec1) + } else if (Vec1) { ShuffleBuilder.add(Vec1, ExtractMask); - else + IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1); + IsUsedInExpr = FindReusedSplat(ExtractMask); + } else { ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get( ScalarTy, GatheredScalars.size())), ExtractMask); + } } if (GatherShuffle) { - if (Entries.size() == 1) + if (Entries.size() == 1) { ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); - else + IsNonPoisoned &= + isGuaranteedNotToBePoison(Entries.front()->VectorizedValue); + IsUsedInExpr = FindReusedSplat(Mask); + } else { ShuffleBuilder.add(Entries.front()->VectorizedValue, Entries.back()->VectorizedValue, Mask); + } } + // Try to figure out best way to combine values: build a shuffle and insert + // elements or just build several shuffles. + // Insert non-constant scalars. + SmallVector NonConstants(GatheredScalars); + int EMSz = ExtractMask.size(); + int MSz = Mask.size(); + // Try to build constant vector and shuffle with it only if currently we + // have a single permutation and more than 1 scalar constants. + bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle; + bool IsIdentityShuffle = + (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc && + none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && + ShuffleVectorInst::isIdentityMask(ExtractMask)) || + (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc && + none_of(Mask, [&](int I) { return I >= MSz; }) && + ShuffleVectorInst::isIdentityMask(Mask)); + bool EnoughConstsForShuffle = + IsSingleShuffle && + (none_of(GatheredScalars, + [](Value *V) { + return isa(V) && !isa(V); + }) || + any_of(GatheredScalars, + [](Value *V) { + return isa(V) && !isa(V); + })) && + (!IsIdentityShuffle || + (GatheredScalars.size() == 2 && + any_of(GatheredScalars, + [](Value *V) { return !isa(V); })) || + count_if(GatheredScalars, [](Value *V) { + return isa(V) && !isa(V); + }) > 1); + // NonConstants array contains just non-constant values, GatheredScalars + // contains only constant to build final vector and then shuffle. + for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) { + if (EnoughConstsForShuffle && isa(GatheredScalars[I])) + NonConstants[I] = PoisonValue::get(ScalarTy); + else + GatheredScalars[I] = PoisonValue::get(ScalarTy); + } + // Generate constants for final shuffle and build a mask for them. 
+ if (!all_of(GatheredScalars, PoisonValue::classof)) { + SmallVector BVMask(GatheredScalars.size(), UndefMaskElem); + Value *BV = gather(GatheredScalars); + for (int I = 0, Sz = GatheredScalars.size(); I < Sz; ++I) { + if (!isa(GatheredScalars[I])) + BVMask[I] = I; + } + ShuffleBuilder.add(BV, BVMask); + } + if (all_of(NonConstants, [=](Value *V) { + return isa(V) || + (IsSingleShuffle && ((IsIdentityShuffle && + IsNonPoisoned) || IsUsedInExpr) && isa(V)); + })) + Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); + else + Vec = ShuffleBuilder.finalize( + E->ReuseShuffleIndices, E->Scalars.size(), + [&](Value *&Vec, SmallVectorImpl &Mask) { + Vec = gather(NonConstants, Vec); + for (unsigned I = 0, Sz = Mask.size(); I < Sz; ++I) + if (!isa(NonConstants[I])) + Mask[I] = I; + }); } else if (!allConstant(E->Scalars)) { // TODO: remove this code once able to combine shuffled vectors and build // vector elements. @@ -9570,13 +9712,14 @@ Value *BoUpSLP::createBuildVector(const TreeEntry *E) { // Gather unique scalars and all constants. Vec = gather(GatheredScalars); ShuffleBuilder.add(Vec, ReuseMask); + Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } else { // Gather all constants. Vec = gather(E->Scalars); ShuffleBuilder.add(Vec, ReuseMask); + Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); } - Vec = ShuffleBuilder.finalize(E->ReuseShuffleIndices); if (NeedFreeze) Vec = Builder.CreateFreeze(Vec); return Vec; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll index 6b104f3..b7f161b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/c-ray.ll @@ -67,23 +67,23 @@ define i32 @ray_sphere(ptr nocapture noundef readonly %sph, ptr nocapture nounde ; CHECK-NEXT: [[MUL88:%.*]] = fmul double [[TMP4]], 2.000000e+00 ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x double> poison, double [[FNEG87]], i32 0 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <2 x double> [[TMP26]], double [[CALL]], i32 1 -; CHECK-NEXT: [[TMP28:%.*]] = insertelement <2 x double> poison, double [[CALL]], i32 0 +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <2 x double> [[TMP27]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP29:%.*]] = insertelement <2 x double> [[TMP28]], double [[TMP12]], i32 1 ; CHECK-NEXT: [[TMP30:%.*]] = fsub <2 x double> [[TMP27]], [[TMP29]] ; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x double> poison, double [[MUL88]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP32:%.*]] = fdiv <2 x double> [[TMP30]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[TMP32]], i32 1 -; CHECK-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP33]], 0x3EB0C6F7A0B5ED8D -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP32]], i32 0 -; CHECK-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <2 x double> [[TMP31]], <2 x double> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP33:%.*]] = fdiv <2 x double> [[TMP30]], [[TMP32]] +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x double> [[TMP33]], i32 1 +; CHECK-NEXT: [[CMP93:%.*]] = fcmp olt double [[TMP34]], 0x3EB0C6F7A0B5ED8D +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <2 x double> [[TMP33]], i32 0 +; CHECK-NEXT: [[CMP94:%.*]] = fcmp olt double [[TMP35]], 0x3EB0C6F7A0B5ED8D ; CHECK-NEXT: [[OR_COND:%.*]] = select i1 [[CMP93]], i1 [[CMP94]], i1 false ; 
CHECK-NEXT: br i1 [[OR_COND]], label [[CLEANUP]], label [[LOR_LHS_FALSE:%.*]] ; CHECK: lor.lhs.false: -; CHECK-NEXT: [[TMP35:%.*]] = fcmp ule <2 x double> [[TMP32]], -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <2 x i1> [[TMP35]], i32 0 -; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP35]], i32 1 -; CHECK-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP37]], i1 true, i1 [[TMP36]] +; CHECK-NEXT: [[TMP36:%.*]] = fcmp ule <2 x double> [[TMP33]], +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <2 x i1> [[TMP36]], i32 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <2 x i1> [[TMP36]], i32 1 +; CHECK-NEXT: [[OR_COND106:%.*]] = select i1 [[TMP38]], i1 true, i1 [[TMP37]] ; CHECK-NEXT: [[SPEC_SELECT:%.*]] = zext i1 [[OR_COND106]] to i32 ; CHECK-NEXT: br label [[CLEANUP]] ; CHECK: cleanup: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll index 8f59278..4ebb7801 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/commutativity.ll @@ -95,11 +95,10 @@ define void @same_opcode_on_one_side(i32 %a, i32 %b, i32 %c) { ; AVX-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i32 0 ; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer ; AVX-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]] -; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[B:%.*]], i32 1 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[C]], i32 2 -; AVX-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> -; AVX-NEXT: [[TMP9:%.*]] = xor <4 x i32> [[TMP5]], [[TMP8]] -; AVX-NEXT: store <4 x i32> [[TMP9]], ptr @cle32, align 16 +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP2]], <4 x i32> +; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[B:%.*]], i32 1 +; AVX-NEXT: [[TMP8:%.*]] = xor <4 x i32> [[TMP5]], [[TMP7]] +; AVX-NEXT: store <4 x i32> [[TMP8]], ptr @cle32, align 16 ; AVX-NEXT: ret void ; %add1 = add i32 %c, %a diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll index f3c9847..4eb63cb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_clear_undefs.ll @@ -5,23 +5,19 @@ target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 define i1 @foo() { ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr null, align 4 -; CHECK-NEXT: br i1 false, label [[TMP15:%.*]], label [[TMP2:%.*]] +; CHECK-NEXT: br i1 false, label [[TMP11:%.*]], label [[TMP2:%.*]] ; CHECK: 2: -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> zeroinitializer, i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> zeroinitializer, <4 x float> [[TMP7]], <4 x 
float> [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fsub <4 x float> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = fadd <4 x float> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> -; CHECK-NEXT: br label [[TMP15]] -; CHECK: 15: -; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x float> [ [[TMP14]], [[TMP2]] ], [ zeroinitializer, [[TMP0:%.*]] ] +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> , float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> zeroinitializer, <4 x float> [[TMP5]], <4 x float> [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fsub <4 x float> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x float> [[TMP7]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> +; CHECK-NEXT: br label [[TMP11]] +; CHECK: 11: +; CHECK-NEXT: [[TMP12:%.*]] = phi <4 x float> [ [[TMP10]], [[TMP2]] ], [ zeroinitializer, [[TMP0:%.*]] ] ; CHECK-NEXT: ret i1 false ; %1 = load float, ptr null, align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll index f110f2e..12b227c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_netbsd_decompress.ll @@ -23,7 +23,7 @@ define i32 @fn1() { ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr @c, align 4 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP2]], 7 ; CHECK-NEXT: store i32 [[AND]], ptr @a, align 4 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> , <2 x i32> [[TMP0]], <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> , <2 x i32> ; CHECK-NEXT: switch i32 [[AND]], label [[IF_END:%.*]] [ ; CHECK-NEXT: i32 7, label [[SAVE_STATE_AND_RETURN]] ; CHECK-NEXT: i32 0, label [[SAVE_STATE_AND_RETURN]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll index f2acfdd..837b4ff 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -99,15 +99,7 @@ define void @test() { ; CHECK-NEXT: br i1 undef, label [[IF_THEN78:%.*]], label [[IF_THEN38:%.*]] ; CHECK: if.then38: ; CHECK-NEXT: [[AGG_TMP74663_SROA_0_0_IDX:%.*]] = getelementptr inbounds [[STRUCT_RAY:%.*]], ptr undef, i64 0, i32 1, i32 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> , double 6.000000e-02, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> , [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> , [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> , [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> , [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> , [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> , [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x double> , [[TMP6]] -; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8 +; CHECK-NEXT: store <2 x double> , ptr [[AGG_TMP74663_SROA_0_0_IDX]], align 8 ; CHECK-NEXT: br label [[IF_THEN78]] ; CHECK: if.then78: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll index a520ddd..b5602c7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -19,14 +19,12 @@ define i32 @test(ptr nocapture %G) { ; CHECK-NEXT: [[TMP1:%.*]] = fmul <2 x double> [[TMP0]], ; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], ; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[G]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, ptr [[G]], i64 2 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 -; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP4]], 4.000000e+00 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[MUL11]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP6]], -; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[ARRAYIDX9]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP0]], i32 1 +; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP3]], 4.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP1]], double [[MUL11]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], +; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[ARRAYIDX9]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll index a616fd5..6adcf21 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled-load-multiuse.ll @@ -9,11 +9,10 @@ define i32 @fn1() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @b, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <4 x i32> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP0]], <4 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[SHUFFLE]], <4 x i32> -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: store <4 x i32> [[SHUFFLE1]], ptr @a, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> , <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr @a, align 4 ; CHECK-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll index adbeb63..52a84e6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -527,11 +527,10 @@ define i1 @foo(float %a, float %b, float %c, <4 x float> %vec, i64 %idx2) { ; SSE-LABEL: @foo( ; SSE-NEXT: [[VECEXT_I291_I166:%.*]] = extractelement <4 x float> [[VEC:%.*]], i64 0 ; SSE-NEXT: [[SUB14_I167:%.*]] = fsub float undef, [[VECEXT_I291_I166]] -; SSE-NEXT: [[VECEXT_I276_I169:%.*]] = extractelement <4 x float> [[VEC]], i64 1 ; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 ; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float [[C:%.*]], i32 1 -; SSE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> 
poison, float [[SUB14_I167]], i32 0 -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_I276_I169]], i32 1 +; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[VEC]], <4 x float> poison, <2 x i32> +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[SUB14_I167]], i32 0 ; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP2]], [[TMP4]] ; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x float> , float [[B:%.*]], i32 0 ; SSE-NEXT: [[TMP7:%.*]] = fsub <2 x float> [[TMP5]], [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll index 118372d..a117fdc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/matched-shuffled-entries.ll @@ -15,23 +15,20 @@ define i32 @bar() local_unnamed_addr { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[ADD78_1]], i32 6 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[SUB86_1]], i32 7 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[ADD78_2]], i32 9 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> , i32 [[SUB86_1]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[ADD78_1]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[ADD94_1]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SUB102_1]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SUB102_3]], i32 12 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = add nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP11:%.*]] = sub nsw <16 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = lshr <16 x i32> [[TMP12]], -; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i32> [[TMP13]], -; CHECK-NEXT: [[TMP15:%.*]] = mul nuw <16 x i32> [[TMP14]], -; CHECK-NEXT: [[TMP16:%.*]] = add <16 x i32> [[TMP15]], [[TMP12]] -; CHECK-NEXT: [[TMP17:%.*]] = xor <16 x i32> [[TMP16]], [[TMP15]] -; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP17]]) -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[TMP18]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[SUB102_3]], i32 12 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SUB102_3]], i32 15 +; CHECK-NEXT: [[TMP9:%.*]] = add nsw <16 x i32> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = sub nsw <16 x i32> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = lshr <16 x i32> [[TMP11]], +; CHECK-NEXT: [[TMP13:%.*]] = and <16 x i32> [[TMP12]], +; CHECK-NEXT: [[TMP14:%.*]] = mul nuw <16 x i32> [[TMP13]], +; CHECK-NEXT: [[TMP15:%.*]] = add <16 x i32> [[TMP14]], [[TMP11]] +; CHECK-NEXT: [[TMP16:%.*]] = xor <16 x i32> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP16]]) +; CHECK-NEXT: [[SHR:%.*]] = lshr i32 
[[TMP17]], 16 ; CHECK-NEXT: [[ADD119:%.*]] = add nuw nsw i32 undef, [[SHR]] ; CHECK-NEXT: [[SHR120:%.*]] = lshr i32 [[ADD119]], 1 ; CHECK-NEXT: ret i32 [[SHR120]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll index d417dd2..e5947dc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/memory-runtime-checks.ll @@ -187,9 +187,9 @@ define void @gather_sequence_crash(<2 x float> %arg, ptr %arg1, float %arg2, ptr ; CHECK-NEXT: br i1 [[C_1:%.*]], label [[BB16:%.*]], label [[BB6:%.*]] ; CHECK: bb6: ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[ARG1:%.*]], i32 3 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> , float [[ARG2:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[ARG:%.*]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <2 x float> [[ARG:%.*]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> , <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float [[ARG2:%.*]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP2]], zeroinitializer ; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[TMP8]], align 4 ; CHECK-NEXT: ret void @@ -226,9 +226,9 @@ define void @gather_sequence_crash(<2 x float> %arg, ptr %arg1, float %arg2, ptr ; CHECK-NEXT: [[TMP38:%.*]] = fadd float 0.000000e+00, [[TMP37]] ; CHECK-NEXT: store float [[TMP38]], ptr [[TMP35]], align 4 ; CHECK-NEXT: [[TMP39:%.*]] = getelementptr float, ptr [[ARG4]], i64 1 -; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, ptr [[TMP39]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> zeroinitializer, [[TMP7]] -; CHECK-NEXT: store <2 x float> [[TMP8]], ptr [[TMP39]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[TMP39]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x float> zeroinitializer, [[TMP4]] +; CHECK-NEXT: store <2 x float> [[TMP5]], ptr [[TMP39]], align 4 ; CHECK-NEXT: [[TMP44:%.*]] = load float, ptr [[ARG3:%.*]], align 4 ; CHECK-NEXT: [[TMP45:%.*]] = load float, ptr [[ARG4]], align 4 ; CHECK-NEXT: [[TMP46:%.*]] = fadd float 0.000000e+00, [[TMP45]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll index 5bac13d..359c074 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -9,19 +9,19 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3 define void @shuffle_operands1(ptr noalias %from, ptr noalias %to, double %v1, double %v2) { ; CHECK-LABEL: @shuffle_operands1( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V2:%.*]], i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TO:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1 +; CHECK-NEXT: 
[[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: ret void ; ; SSE2-LABEL: @shuffle_operands1( -; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0 -; SSE2-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[V2:%.*]], i64 1 -; SSE2-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; SSE2-NEXT: store <2 x double> [[TMP5]], ptr [[TO:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[V1:%.*]], i64 0 +; SSE2-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[V2:%.*]], i64 1 +; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: ret void ; %from_1 = getelementptr double, ptr %from, i64 1 @@ -41,11 +41,11 @@ define void @vecload_vs_broadcast(ptr noalias %from, ptr noalias %to, double %v1 ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]] +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void @@ -55,11 +55,11 @@ define void @vecload_vs_broadcast(ptr noalias %from, ptr noalias %to, double %v1 ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] -; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 +; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP0]], [[TMP2]] +; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: ; SSE2-NEXT: ret void @@ -89,11 +89,11 @@ define void @vecload_vs_broadcast2(ptr noalias %from, ptr noalias %to, double %v ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x 
double>, ptr [[FROM:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] -; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]] +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void @@ -103,11 +103,11 @@ define void @vecload_vs_broadcast2(ptr noalias %from, ptr noalias %to, double %v ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] -; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 +; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]] +; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: ; SSE2-NEXT: ret void @@ -137,11 +137,11 @@ define void @vecload_vs_broadcast3(ptr noalias %from, ptr noalias %to, double %v ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] -; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]] +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: ; CHECK-NEXT: ret void @@ -151,11 +151,11 @@ define void @vecload_vs_broadcast3(ptr noalias %from, ptr noalias %to, double %v ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[P]], i64 0 -; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> 
[[TMP2]], <2 x double> [[TMP1]], <2 x i32> -; SSE2-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], [[TMP1]] -; SSE2-NEXT: store <2 x double> [[TMP4]], ptr [[TO:%.*]], align 4 +; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 0 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP0]] +; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: ; SSE2-NEXT: ret void @@ -185,10 +185,10 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1 ; CHECK-NEXT: br label [[LP:%.*]] ; CHECK: lp: ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]] +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]] ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; CHECK-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; CHECK: ext: @@ -199,10 +199,10 @@ define void @shuffle_nodes_match1(ptr noalias %from, ptr noalias %to, double %v1 ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]] +; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]] ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: @@ -251,10 +251,10 @@ define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[SHUFFLE]] +; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double 
[[P]], i64 1 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], [[TMP1]] ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: @@ -304,10 +304,10 @@ define void @shuffle_nodes_match2(ptr noalias %from, ptr noalias %to, double %v1 ; SSE2-NEXT: br label [[LP:%.*]] ; SSE2: lp: ; SSE2-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ] -; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 -; SSE2-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[P]], i64 1 -; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[SHUFFLE]], [[TMP2]] +; SSE2-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[FROM:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <2 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP0]], double [[P]], i64 1 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] ; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[TO:%.*]], align 4 ; SSE2-NEXT: br i1 undef, label [[LP]], label [[EXT:%.*]] ; SSE2: ext: @@ -345,7 +345,7 @@ define void @good_load_order() { ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr @a, align 16 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] ; CHECK: for.body3: -; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 @@ -355,20 +355,20 @@ define void @good_load_order() { ; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 ; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] -; CHECK-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]] +; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP13]] -; CHECK-NEXT: [[TMP14]] = load float, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i64 3 -; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]] 
+; CHECK-NEXT: [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3 +; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]] ; CHECK-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995 +; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]] ; CHECK: for.end: ; CHECK-NEXT: ret void @@ -380,7 +380,7 @@ define void @good_load_order() { ; SSE2-NEXT: [[TMP0:%.*]] = load float, ptr @a, align 16 ; SSE2-NEXT: br label [[FOR_BODY3:%.*]] ; SSE2: for.body3: -; SSE2-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ] +; SSE2-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ] ; SSE2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] ; SSE2-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; SSE2-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 @@ -390,20 +390,20 @@ define void @good_load_order() { ; SSE2-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; SSE2-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 ; SSE2-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP6]] -; SSE2-NEXT: [[TMP8:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 -; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i64 0 -; SSE2-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> -; SSE2-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] -; SSE2-NEXT: store <4 x float> [[TMP11]], ptr [[ARRAYIDX5]], align 4 +; SSE2-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4 +; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <4 x i32> +; SSE2-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[TMP1]], i64 0 +; SSE2-NEXT: [[TMP10:%.*]] = fmul <4 x float> [[TMP7]], [[TMP9]] +; SSE2-NEXT: store <4 x float> [[TMP10]], ptr [[ARRAYIDX5]], align 4 ; SSE2-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 -; SSE2-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; SSE2-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP13]] -; SSE2-NEXT: [[TMP14]] = load float, ptr [[ARRAYIDX41]], align 4 -; SSE2-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i64 3 -; SSE2-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]] +; SSE2-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; SSE2-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i32 0, i32 [[TMP11]] +; SSE2-NEXT: [[TMP12]] = load float, ptr [[ARRAYIDX41]], align 4 +; SSE2-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP7]], i64 3 +; SSE2-NEXT: [[MUL45:%.*]] = fmul float [[TMP12]], [[TMP13]] ; SSE2-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4 -; SSE2-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; SSE2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995 +; SSE2-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; SSE2-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP14]], 31995 ; SSE2-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]] ; SSE2: for.end: ; SSE2-NEXT: ret void @@ -458,17 
+458,17 @@ for.end: define void @load_reorder_double(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){ ; CHECK-LABEL: @load_reorder_double( -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4 ; CHECK-NEXT: ret void ; ; SSE2-LABEL: @load_reorder_double( -; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4 -; SSE2-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4 -; SSE2-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]] -; SSE2-NEXT: store <2 x double> [[TMP5]], ptr [[C:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; SSE2-NEXT: store <2 x double> [[TMP3]], ptr [[C:%.*]], align 4 ; SSE2-NEXT: ret void ; %1 = load double, ptr %a @@ -493,17 +493,17 @@ define void @load_reorder_double(ptr nocapture %c, ptr noalias nocapture readonl define void @load_reorder_float(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b){ ; CHECK-LABEL: @load_reorder_float( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4 ; CHECK-NEXT: ret void ; ; SSE2-LABEL: @load_reorder_float( -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 -; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; SSE2-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; SSE2-NEXT: store <4 x float> [[TMP5]], ptr [[C:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[A:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; SSE2-NEXT: store <4 x float> [[TMP3]], ptr [[C:%.*]], align 4 ; SSE2-NEXT: ret void ; %1 = load float, ptr %a @@ -542,21 +542,21 @@ define void @load_reorder_float(ptr nocapture %c, ptr noalias nocapture readonly define void @opcode_reorder(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %c,ptr noalias nocapture readonly %d) { ; CHECK-LABEL: @opcode_reorder( -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4 -; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[TMP5]] -; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[A:%.*]], align 4 
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]] +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4 ; CHECK-NEXT: ret void ; ; SSE2-LABEL: @opcode_reorder( -; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 -; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 -; SSE2-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]] -; SSE2-NEXT: [[TMP7:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4 -; SSE2-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[TMP7]], [[TMP5]] -; SSE2-NEXT: store <4 x float> [[TMP8]], ptr [[A:%.*]], align 4 +; SSE2-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B:%.*]], align 4 +; SSE2-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[C:%.*]], align 4 +; SSE2-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = load <4 x float>, ptr [[D:%.*]], align 4 +; SSE2-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP4]], [[TMP3]] +; SSE2-NEXT: store <4 x float> [[TMP5]], ptr [[A:%.*]], align 4 ; SSE2-NEXT: ret void ; %1 = load float, ptr %b diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll index 4f2c9564..57a9238 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -26,11 +26,11 @@ define i32 @foo(ptr nocapture %A, i32 %k) { ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_ELSE:%.*]], label [[IF_END:%.*]] ; CHECK: if.else: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 10 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[IF_ELSE]] ], [ , [[ENTRY:%.*]] ] -; CHECK-NEXT: store <2 x double> [[TMP2]], ptr [[A]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x double> [ [[TMP0]], [[IF_ELSE]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[A]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: @@ -73,19 +73,19 @@ if.end: ; preds = %entry, %if.else define i32 @foo2(ptr noalias nocapture %B, ptr noalias nocapture %A, i32 %n, i32 %m) #0 { ; CHECK-LABEL: @foo2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_019:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP5]] = fadd <2 x double> [[TMP4]], +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x double> [ [[TMP0]], [[ENTRY]] ], [ [[TMP4:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], +; CHECK-NEXT: [[TMP4]] = fadd <2 x double> [[TMP3]], ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_019]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], 
label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[B:%.*]], align 8 +; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[B:%.*]], align 8 ; CHECK-NEXT: ret i32 0 ; entry: @@ -138,41 +138,40 @@ define float @foo3(ptr nocapture readonly %A) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP4]], float [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP0]], i32 0 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP16:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP8]], 7.000000e+00 +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP3]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP6]], 7.000000e+00 ; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] -; CHECK-NEXT: [[TMP9:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 ; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP12]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> [[TMP12]], <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP13]], float [[TMP10]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = fmul <4 x float> [[TMP14]], -; CHECK-NEXT: [[TMP16]] = fadd <4 x float> [[TMP6]], [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP17]], 121 +; CHECK-NEXT: [[TMP9]] = load <2 x float>, ptr [[ARRAYIDX19]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> [[TMP9]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = fmul <4 x float> [[TMP11]], +; CHECK-NEXT: [[TMP13]] = fadd <4 x float> [[TMP4]], [[TMP12]] +; CHECK-NEXT: 
[[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP16]], i32 0 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x float> [[TMP16]], i32 1 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP19]] -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP16]], i32 2 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP16]], i32 3 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP21]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP13]], i32 0 +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x float> [[TMP13]], i32 1 +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP16]] +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[TMP13]], i32 2 +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x float> [[TMP13]], i32 3 +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP18]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll index 98df52a..9c7e8f6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -68,14 +68,13 @@ define void @pr35497() local_unnamed_addr #0 { ; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> ; SSE-NEXT: [[TMP5:%.*]] = add nuw nsw <2 x i64> [[TMP4]], zeroinitializer ; SSE-NEXT: store <2 x i64> [[TMP5]], ptr undef, align 1 -; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0 -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP5]], <2 x i32> -; SSE-NEXT: [[TMP8:%.*]] = shl <2 x i64> [[TMP7]], -; SSE-NEXT: [[TMP9:%.*]] = and <2 x i64> [[TMP8]], -; SSE-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> poison, <2 x i32> -; SSE-NEXT: [[TMP11:%.*]] = lshr <2 x i64> [[TMP5]], -; SSE-NEXT: [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP10]], [[TMP11]] -; SSE-NEXT: store <2 x i64> [[TMP12]], ptr [[ARRAYIDX2_2]], align 1 +; SSE-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[ADD]], i32 0 +; SSE-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], +; SSE-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], +; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <2 x i32> +; SSE-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP5]], +; SSE-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]] +; SSE-NEXT: store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1 ; SSE-NEXT: ret void ; ; AVX-LABEL: @pr35497( @@ -89,14 +88,13 @@ define void @pr35497() local_unnamed_addr #0 { ; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], ; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer ; AVX-NEXT: store <2 x i64> [[TMP4]], ptr undef, align 1 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[ADD]], i32 0 -; AVX-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP4]], <2 x i32> -; AVX-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], -; AVX-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], -; AVX-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 
x i64> poison, <2 x i32> -; AVX-NEXT: [[TMP10:%.*]] = lshr <2 x i64> [[TMP4]], -; AVX-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP10]] -; AVX-NEXT: store <2 x i64> [[TMP11]], ptr [[ARRAYIDX2_2]], align 1 +; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 [[ADD]], i32 0 +; AVX-NEXT: [[TMP6:%.*]] = shl <2 x i64> [[TMP5]], +; AVX-NEXT: [[TMP7:%.*]] = and <2 x i64> [[TMP6]], +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <2 x i32> +; AVX-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], +; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]] +; AVX-NEXT: store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1 ; AVX-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll index 0be5e2d..97fb568 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-logical.ll @@ -329,19 +329,12 @@ define i1 @logical_and_icmp_clamp_extra_use_select(<4 x i32> %x) { define i1 @logical_and_icmp_clamp_v8i32(<8 x i32> %x, <8 x i32> %y) { ; CHECK-LABEL: @logical_and_icmp_clamp_v8i32( -; CHECK-NEXT: [[Y0:%.*]] = extractelement <8 x i32> [[Y:%.*]], i32 0 -; CHECK-NEXT: [[Y1:%.*]] = extractelement <8 x i32> [[Y]], i32 1 -; CHECK-NEXT: [[Y2:%.*]] = extractelement <8 x i32> [[Y]], i32 2 -; CHECK-NEXT: [[Y3:%.*]] = extractelement <8 x i32> [[Y]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[X:%.*]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> , i32 [[Y0]], i32 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[Y1]], i32 5 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[Y2]], i32 6 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[Y3]], i32 7 -; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <8 x i32> [[TMP1]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = freeze <8 x i1> [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP7]]) -; CHECK-NEXT: ret i1 [[TMP8]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[Y:%.*]], <8 x i32> , <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = freeze <8 x i1> [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP4]]) +; CHECK-NEXT: ret i1 [[TMP5]] ; %x0 = extractelement <8 x i32> %x, i32 0 %x1 = extractelement <8 x i32> %x, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll index ffa052f..458f4f9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction2.ll @@ -89,7 +89,7 @@ define i1 @fcmp_lt_gt(double %a, double %b, double %c) { ; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[FNEG]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[B]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> [[TMP1]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 
@@ -138,7 +138,7 @@ define i1 @fcmp_lt(double %a, double %b, double %c) { ; CHECK-NEXT: [[MUL:%.*]] = fmul double [[A:%.*]], 2.000000e+00 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[C:%.*]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[FNEG]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[B]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[MUL]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll index 72faaa9..1657850 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll @@ -14,17 +14,16 @@ define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227) ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> , ptr [[I242]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x ptr> poison, ptr [[I250]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[I242]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[I245]], i32 2 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x ptr> [[TMP7]], ptr [[I248]], i32 3 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP8]], <8 x ptr> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x ptr> , <8 x ptr> [[TMP1]], <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = icmp ult <8 x ptr> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i1> [[TMP11]], <8 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP4]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP13]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP14]], false +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> , <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i1> [[TMP10]], <8 x i1> poison, <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP4]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP12]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = and i1 [[TMP13]], false ; CHECK-NEXT: ret i1 [[OP_RDX]] ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll index b182df5..035bc3e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll @@ -10,7 +10,7 @@ define void @test(ptr noalias %0, ptr %p) { ; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> 
@llvm.masked.gather.v8f32.v8p0(<8 x ptr> [[TMP4]], i32 4, <8 x i1> , <8 x float> poison) ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> , <16 x float> [[TMP8]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP8]], <16 x float> , <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: store <16 x float> [[TMP11]], ptr [[TMP5]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll index f996040..ccde383 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll @@ -17,9 +17,8 @@ define i1 @test() { ; CHECK-NEXT: [[T13:%.*]] = and <2 x i32> [[TMP4]], zeroinitializer ; CHECK-NEXT: br label [[ELSE1:%.*]] ; CHECK: else1: -; CHECK-NEXT: [[T20:%.*]] = extractelement <2 x i32> [[T13]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[BF_CAST162]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[T20]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[T13]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[BF_CAST162]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt <2 x i32> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 ; CHECK-NEXT: ret i1 [[TMP8]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll index 284737b..c79e9b9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll @@ -12,10 +12,10 @@ define void @test() { ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> , <2 x float> [[TMP0]], <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> , <2 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> , <2 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]]) ; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: @@ -23,12 +23,12 @@ define void @test() { ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], [[BB2]] ], [ zeroinitializer, [[BB1]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer -; 
CHECK-NEXT: [[TMP13:%.*]] = fsub <2 x float> [[TMP12]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP1]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd <2 x float> [[TMP12]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fsub <2 x float> [[TMP13]], zeroinitializer -; CHECK-NEXT: store <2 x float> [[TMP14]], ptr [[ARRAYIDX21_I]], align 16 +; CHECK-NEXT: [[TMP15:%.*]] = fsub <2 x float> [[TMP14]], zeroinitializer +; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[ARRAYIDX21_I]], align 16 ; CHECK-NEXT: ret void ; entry: -- 2.7.4