From 2faacf61a50e7f23fd10927cbbb98c59799bfcd0 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 9 Dec 2021 10:34:08 -0800 Subject: [PATCH] [SLP]Improve shuffles cost estimation where possible. Improved/fixed cost modeling for shuffles by providing masks, and improved the cost model for non-identity insertelements. Differential Revision: https://reviews.llvm.org/D115462 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 119 +++++++++++++++------ .../accelerate-vector-functions-inseltpoison.ll | 96 ++++++++--------- .../AArch64/accelerate-vector-functions.ll | 96 ++++++++--------- .../SLPVectorizer/AArch64/buildvector-vectorize.ll | 21 ++++ .../Transforms/SLPVectorizer/AArch64/tsc-s116.ll | 29 ++--- llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll | 110 ++++++------------- .../X86/alternate-int-inseltpoison.ll | 24 +++-- .../Transforms/SLPVectorizer/X86/alternate-int.ll | 24 +++-- .../SLPVectorizer/X86/load-merge-inseltpoison.ll | 13 ++- .../Transforms/SLPVectorizer/X86/load-merge.ll | 13 ++- .../Transforms/SLPVectorizer/X86/malformed_phis.ll | 34 ++---- llvm/test/Transforms/SLPVectorizer/X86/resched.ll | 43 ++++---- 12 files changed, 319 insertions(+), 303 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-vectorize.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 168b5c1..d7769ef 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5685,10 +5685,6 @@ computeExtractCost(ArrayRef<Value *> VL, FixedVectorType *VecTy, for (auto *V : VL) { ++Idx; - // Need to exclude undefs from analysis. - if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem) - continue; - // Reached the start of a new vector registers. if (Idx % EltsPerVector == 0) { RegMask.assign(EltsPerVector, UndefMaskElem); continue; } + // Need to exclude undefs from analysis. + if (isa<UndefValue>(V) || Mask[Idx] == UndefMaskElem) + continue; + // Check all extracts for a vector register on the target directly // extract values in order. unsigned CurrentIdx = *getExtractIndex(cast<Instruction>(V)); @@ -6145,28 +6145,60 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, assert(E->ReuseShuffleIndices.empty() && "Unique insertelements only are expected."); auto *SrcVecTy = cast<FixedVectorType>(VL0->getType()); - unsigned const NumElts = SrcVecTy->getNumElements(); unsigned const NumScalars = VL.size(); + + unsigned NumOfParts = TTI->getNumberOfParts(SrcVecTy); + + unsigned OffsetBeg = *getInsertIndex(VL.front()); + unsigned OffsetEnd = OffsetBeg; + for (Value *V : VL.drop_front()) { + unsigned Idx = *getInsertIndex(V); + if (OffsetBeg > Idx) + OffsetBeg = Idx; + else if (OffsetEnd < Idx) + OffsetEnd = Idx; + } + unsigned VecScalarsSz = PowerOf2Ceil(NumElts); + if (NumOfParts > 0) + VecScalarsSz = PowerOf2Ceil((NumElts + NumOfParts - 1) / NumOfParts); + unsigned VecSz = + (1 + OffsetEnd / VecScalarsSz - OffsetBeg / VecScalarsSz) * + VecScalarsSz; + unsigned Offset = VecScalarsSz * (OffsetBeg / VecScalarsSz); + unsigned InsertVecSz = std::min<unsigned>( + PowerOf2Ceil(OffsetEnd - OffsetBeg + 1), + ((OffsetEnd - OffsetBeg + VecScalarsSz) / VecScalarsSz) * + VecScalarsSz); + bool IsWholeSubvector = + OffsetBeg == Offset && ((OffsetEnd + 1) % VecScalarsSz == 0); + // Check if we can safely insert a subvector. If it is not possible, just + // generate a whole-sized vector and shuffle the source vector and the new + // subvector.
+ if (OffsetBeg + InsertVecSz > VecSz) { + // Align OffsetBeg to generate correct mask. + OffsetBeg = alignDown(OffsetBeg, VecSz, Offset); + InsertVecSz = VecSz; + } + APInt DemandedElts = APInt::getZero(NumElts); // TODO: Add support for Instruction::InsertValue. SmallVector<int> Mask; if (!E->ReorderIndices.empty()) { inversePermutation(E->ReorderIndices, Mask); - Mask.append(NumElts - NumScalars, UndefMaskElem); + Mask.append(InsertVecSz - Mask.size(), UndefMaskElem); } else { - Mask.assign(NumElts, UndefMaskElem); - std::iota(Mask.begin(), std::next(Mask.begin(), NumScalars), 0); + Mask.assign(VecSz, UndefMaskElem); + std::iota(Mask.begin(), std::next(Mask.begin(), InsertVecSz), 0); } - unsigned Offset = *getInsertIndex(VL0); bool IsIdentity = true; - SmallVector<int> PrevMask(NumElts, UndefMaskElem); + SmallVector<int> PrevMask(InsertVecSz, UndefMaskElem); Mask.swap(PrevMask); for (unsigned I = 0; I < NumScalars; ++I) { unsigned InsertIdx = *getInsertIndex(VL[PrevMask[I]]); DemandedElts.setBit(InsertIdx); - IsIdentity &= InsertIdx - Offset == I; - Mask[InsertIdx - Offset] = I; + IsIdentity &= InsertIdx - OffsetBeg == I; + Mask[InsertIdx - OffsetBeg] = I; } assert(Offset < NumElts && "Failed to find vector index offset"); @@ -6174,32 +6206,41 @@ InstructionCost BoUpSLP::getEntryCost(const TreeEntry *E, Cost -= TTI->getScalarizationOverhead(SrcVecTy, DemandedElts, /*Insert*/ true, /*Extract*/ false); - if (IsIdentity && NumElts != NumScalars && Offset % NumScalars != 0) { - // FIXME: Replace with SK_InsertSubvector once it is properly supported. - unsigned Sz = PowerOf2Ceil(Offset + NumScalars); - Cost += TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, - FixedVectorType::get(SrcVecTy->getElementType(), Sz)); - } else if (!IsIdentity) { - auto *FirstInsert = - cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { - return !is_contained(E->Scalars, - cast<InsertElementInst>(V)->getOperand(0)); - })); - if (isUndefVector(FirstInsert->getOperand(0))) { - Cost += TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, SrcVecTy, Mask); + // First cost - resize to actual vector size if not identity shuffle or + // need to shift the vector. + // Do not calculate the cost if the actual size is the register size and + // we can merge this shuffle with the following SK_Select. + auto *InsertVecTy = + FixedVectorType::get(SrcVecTy->getElementType(), InsertVecSz); + if (!IsIdentity) + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + InsertVecTy, Mask); + auto *FirstInsert = cast<Instruction>(*find_if(E->Scalars, [E](Value *V) { + return !is_contained(E->Scalars, cast<InsertElementInst>(V)->getOperand(0)); + })); + // Second cost - permutation with subvector, if some elements are from the + // initial vector or inserting a subvector. + // TODO: Implement the analysis of the FirstInsert->getOperand(0) + // subvector of ActualVecTy.
+ if (!isUndefVector(FirstInsert->getOperand(0)) && NumScalars != NumElts && + !IsWholeSubvector) { + if (InsertVecSz != VecSz) { + auto *ActualVecTy = + FixedVectorType::get(SrcVecTy->getElementType(), VecSz); + Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, ActualVecTy, + None, OffsetBeg - Offset, InsertVecTy); } else { - SmallVector<int> InsertMask(NumElts); - std::iota(InsertMask.begin(), InsertMask.end(), 0); - for (unsigned I = 0; I < NumElts; I++) { + for (unsigned I = 0, End = OffsetBeg - Offset; I < End; ++I) + Mask[I] = I; + for (unsigned I = OffsetBeg - Offset, End = OffsetEnd - Offset; + I <= End; ++I) if (Mask[I] != UndefMaskElem) - InsertMask[Offset + I] = NumElts + I; - } - Cost += - TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVecTy, InsertMask); + Mask[I] = I + VecSz; + for (unsigned I = OffsetEnd + 1 - Offset; I < VecSz; ++I) + Mask[I] = I; + Cost += TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, InsertVecTy, Mask); } } - return Cost; } case Instruction::ZExt: @@ -6659,7 +6700,10 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { // No need to vectorize inserts of gathered values. if (VectorizableTree.size() == 2 && isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) && - VectorizableTree[1]->State == TreeEntry::NeedToGather) + VectorizableTree[1]->State == TreeEntry::NeedToGather && + (VectorizableTree[1]->getVectorFactor() <= 2 || + !(isSplat(VectorizableTree[1]->Scalars) || + allConstant(VectorizableTree[1]->Scalars)))) return true; // We can vectorize the tree if its size is greater than or equal to the @@ -7693,6 +7737,11 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) { return createBuildVector(VL); } Value *BoUpSLP::createBuildVector(ArrayRef<Value *> VL) { + assert(any_of(VectorizableTree, + [VL](const std::unique_ptr<TreeEntry> &TE) { + return TE->State == TreeEntry::NeedToGather && TE->isSame(VL); + }) && + "Non-matching gather node."); unsigned VF = VL.size(); // Exploit possible reuse of values across lanes.
SmallVector ReuseShuffleIndicies; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll index fcc70d9..72b724f 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions-inseltpoison.ll @@ -22,16 +22,16 @@ define <4 x float> @int_sin_4x(<4 x float>* %a) { ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -218,16 +218,16 @@ define <4 x float> @exp_4x(<4 x float>* %a) { ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector 
<4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -301,16 +301,16 @@ define <4 x float> @log_4x(<4 x float>* %a) { ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -477,16 +477,16 @@ define <4 x float> @sin_4x(<4 x float>* %a) { ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; 
NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -519,16 +519,16 @@ define <4 x float> @cos_4x(<4 x float>* %a) { ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @cosf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, <4 x 
float>* %a, align 16 @@ -1010,16 +1010,16 @@ define <4 x float> @int_cos_4x(<4 x float>* %a) { ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll index 77def6a..d988e8f 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll @@ -22,16 +22,16 @@ define <4 x float> @int_sin_4x(<4 x float>* %a) { ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> 
[[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -218,16 +218,16 @@ define <4 x float> @exp_4x(<4 x float>* %a) { ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @expf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @expf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -301,16 +301,16 @@ define <4 x float> @log_4x(<4 x float>* %a) { ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @logf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: 
[[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @logf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -477,16 +477,16 @@ define <4 x float> @sin_4x(<4 x float>* %a) { ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @sinf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @sinf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 
@@ -519,16 +519,16 @@ define <4 x float> @cos_4x(<4 x float>* %a) { ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @cosf(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @cosf(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @cosf(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> poison, float [[VECEXT_2]], i32 0 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 @@ -1010,16 +1010,16 @@ define <4 x float> @int_cos_4x(<4 x float>* %a) { ; NOACCELERATE-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) ; NOACCELERATE-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 ; NOACCELERATE-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 +; NOACCELERATE-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) +; NOACCELERATE-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1 ; NOACCELERATE-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 -; NOACCELERATE-NEXT: [[TMP2:%.*]] = insertelement <2 x float> poison, float [[VECEXT_1]], i32 0 -; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[VECEXT_2]], i32 1 -; NOACCELERATE-NEXT: [[TMP4:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP3]]) -; NOACCELERATE-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; NOACCELERATE-NEXT: [[VECINS_21:%.*]] = shufflevector <4 x float> [[VECINS]], <4 x float> [[TMP5]], <4 x i32> ; NOACCELERATE-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3 -; NOACCELERATE-NEXT: [[TMP6:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]]) -; NOACCELERATE-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_21]], float [[TMP6]], i32 3 -; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_3]] +; NOACCELERATE-NEXT: [[TMP3:%.*]] = insertelement <2 x 
float> poison, float [[VECEXT_2]], i32 0 +; NOACCELERATE-NEXT: [[TMP4:%.*]] = insertelement <2 x float> [[TMP3]], float [[VECEXT_3]], i32 1 +; NOACCELERATE-NEXT: [[TMP5:%.*]] = call fast <2 x float> @llvm.cos.v2f32(<2 x float> [[TMP4]]) +; NOACCELERATE-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> +; NOACCELERATE-NEXT: [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP6]], <4 x i32> +; NOACCELERATE-NEXT: ret <4 x float> [[VECINS_31]] ; entry: %0 = load <4 x float>, <4 x float>* %a, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-vectorize.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-vectorize.ll new file mode 100644 index 0000000..30658b6 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/buildvector-vectorize.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -slp-vectorizer -S -mtriple=aarch64 < %s | FileCheck %s + +define void @test(ptr %p) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[INC:%.*]] = getelementptr inbounds i16, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[INC]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[P]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i16> zeroinitializer, i16 [[TMP0]], i32 5 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> [[TMP2]], i16 [[TMP1]], i32 7 +; CHECK-NEXT: ret void +; +entry: + %inc = getelementptr inbounds i16, ptr %p, i64 1 + %0 = load i16, ptr %inc, align 4 + %1 = load i16, ptr %p, align 2 + %2 = insertelement <8 x i16> zeroinitializer, i16 %0, i32 5 + %3 = insertelement <8 x i16> %2, i16 %1, i32 7 + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index e25fd9b..61acd959 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -19,22 +19,23 @@ define void @s116_modified(float* %a) { ; CHECK-LABEL: @s116_modified( ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds float, float* [[A]], i64 3 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds float, float* [[A]], i64 4 +; CHECK-NEXT: [[LD1:%.*]] = load float, float* [[GEP1]], align 4 ; CHECK-NEXT: [[LD0:%.*]] = load float, float* [[GEP0]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP2]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[GEP3]] to <2 x float>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP6]], <4 x 
float> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> [[TMP10]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[TMP9]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = bitcast float* [[GEP0]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP12]], <4 x float>* [[TMP13]], align 4 +; CHECK-NEXT: [[LD4:%.*]] = load float, float* [[GEP4]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[LD4]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[LD1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[LD1]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = bitcast float* [[GEP0]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP10]], <4 x float>* [[TMP11]], align 4 ; CHECK-NEXT: ret void ; %gep0 = getelementptr inbounds float, float* %a, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll index 25ee637..1114078 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -6,95 +6,55 @@ define void @Test(i32) { ; CHECK-LABEL: @Test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP0]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP0]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP0]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP0]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[TMP0]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP0]], i32 2 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP0]], i32 4 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[TMP0]], i32 5 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP0]], i32 6 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP0]], i32 7 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP0]], i32 8 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP0]], i32 9 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP0]], i32 10 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP0]], i32 11 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP0]], i32 12 -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> 
[[TMP21]], i32 [[TMP0]], i32 13 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP0]], i32 14 -; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP0]], i32 15 +; CHECK-NEXT: [[SHUFFLE7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[SHUFFLE6:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP25:%.*]] = phi <2 x i32> [ [[TMP36:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; CHECK-NEXT: [[TMP27:%.*]] = add <8 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP24]]) -; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP28]], [[TMP29]] -; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP27]]) -; CHECK-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP30]] +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ [[TMP14:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[SHUFFLE6]]) +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[SHUFFLE7]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP5]]) +; CHECK-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP8]] ; CHECK-NEXT: [[OP_RDX2:%.*]] = and i32 [[TMP0]], [[TMP0]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP26]] +; CHECK-NEXT: [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP4]] ; CHECK-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]] -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0 -; CHECK-NEXT: [[TMP32:%.*]] = insertelement <2 x i32> [[TMP31]], i32 [[TMP26]], i32 1 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX4]], i32 0 -; CHECK-NEXT: [[TMP34:%.*]] = and <2 x i32> [[TMP32]], [[TMP33]] -; CHECK-NEXT: [[TMP35:%.*]] = add <2 x i32> [[TMP32]], [[TMP33]] -; CHECK-NEXT: [[TMP36]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> [[TMP35]], <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> , i32 [[OP_RDX4]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = and <2 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP14]] = shufflevector <2 x i32> [[TMP12]], <2 x i32> [[TMP13]], <2 x i32> ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( ; FORCE_REDUCTION-NEXT: entry: ; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP0]], i32 1 -; FORCE_REDUCTION-NEXT: 
[[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP0]], i32 2 -; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP0]], i32 3 -; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP0]], i32 4 -; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP0]], i32 5 -; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP0]], i32 6 -; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[TMP0]], i32 7 -; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[TMP0]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP0]], i32 2 -; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[TMP0]], i32 3 -; FORCE_REDUCTION-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP0]], i32 4 -; FORCE_REDUCTION-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[TMP0]], i32 5 -; FORCE_REDUCTION-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP0]], i32 6 -; FORCE_REDUCTION-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP0]], i32 7 -; FORCE_REDUCTION-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP0]], i32 8 -; FORCE_REDUCTION-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP0]], i32 9 -; FORCE_REDUCTION-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP0]], i32 10 -; FORCE_REDUCTION-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP0]], i32 11 -; FORCE_REDUCTION-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP0]], i32 12 -; FORCE_REDUCTION-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP0]], i32 13 -; FORCE_REDUCTION-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP0]], i32 14 -; FORCE_REDUCTION-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP0]], i32 15 +; FORCE_REDUCTION-NEXT: [[SHUFFLE7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer +; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> poison, i32 [[TMP0]], i32 0 +; FORCE_REDUCTION-NEXT: [[SHUFFLE6:%.*]] = shufflevector <16 x i32> [[TMP2]], <16 x i32> poison, <16 x i32> zeroinitializer ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP25:%.*]] = phi <2 x i32> [ [[TMP32:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP25]], <2 x i32> poison, <8 x i32> -; FORCE_REDUCTION-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP27:%.*]] = add <8 x i32> [[SHUFFLE]], -; FORCE_REDUCTION-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP24]]) -; FORCE_REDUCTION-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP8]]) -; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP28]], [[TMP29]] -; FORCE_REDUCTION-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP27]]) -; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP30]] +; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ [[TMP10:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> +; 
FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = add <8 x i32> [[SHUFFLE]], +; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[SHUFFLE6]]) +; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[SHUFFLE7]]) +; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP6]], [[TMP7]] +; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP5]]) +; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[TMP8]] ; FORCE_REDUCTION-NEXT: [[OP_RDX2:%.*]] = and i32 [[TMP0]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP26]] +; FORCE_REDUCTION-NEXT: [[OP_RDX3:%.*]] = and i32 [[TMP0]], [[TMP4]] ; FORCE_REDUCTION-NEXT: [[OP_RDX4:%.*]] = and i32 [[OP_RDX2]], [[OP_RDX3]] ; FORCE_REDUCTION-NEXT: [[OP_RDX5:%.*]] = and i32 [[OP_RDX1]], [[OP_RDX4]] -; FORCE_REDUCTION-NEXT: [[VAL_43:%.*]] = add i32 [[TMP26]], 14910 -; FORCE_REDUCTION-NEXT: [[TMP31:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX5]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP32]] = insertelement <2 x i32> [[TMP31]], i32 [[VAL_43]], i32 1 +; FORCE_REDUCTION-NEXT: [[VAL_43:%.*]] = add i32 [[TMP4]], 14910 +; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX5]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP10]] = insertelement <2 x i32> [[TMP9]], i32 [[VAL_43]], i32 1 ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll index e3d1f25..99b3173 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int-inseltpoison.ll @@ -230,21 +230,25 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) { ; SSE-LABEL: @ashr_lshr_shl_v8i32( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7 +; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6 +; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> ; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]] ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32> -; SSE-NEXT: [[TMP8:%.*]] = shl <8 x i32> [[A]], [[B]] -; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32> -; SSE-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> -; SSE-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32> -; SSE-NEXT: [[R52:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> [[TMP11]], <8 x i32> -; SSE-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <8 x i32> -; SSE-NEXT: [[R71:%.*]] = 
shufflevector <8 x i32> [[R52]], <8 x i32> [[TMP12]], <8 x i32>
-; SSE-NEXT: ret <8 x i32> [[R71]]
+; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32>
+; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32>
+; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32>
+; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6
+; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; SSE-NEXT: ret <8 x i32> [[R7]]
 ;
 ; SLM-LABEL: @ashr_lshr_shl_v8i32(
 ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
index 5f1fb55..a89bb20 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll
@@ -230,21 +230,25 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) {
 define <8 x i32> @ashr_lshr_shl_v8i32(<8 x i32> %a, <8 x i32> %b) {
 ; SSE-LABEL: @ashr_lshr_shl_v8i32(
-; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32>
-; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B:%.*]], <8 x i32> undef, <4 x i32>
+; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 6
+; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i64 7
+; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i64 6
+; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i64 7
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32>
 ; SSE-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]]
 ; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>
 ; SSE-NEXT: [[TMP6:%.*]] = lshr <8 x i32> [[A]], [[B]]
 ; SSE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <2 x i32>
-; SSE-NEXT: [[TMP8:%.*]] = shl <8 x i32> [[A]], [[B]]
-; SSE-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <2 x i32>
-; SSE-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32>
-; SSE-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32>
-; SSE-NEXT: [[R52:%.*]] = shufflevector <8 x i32> [[TMP10]], <8 x i32> [[TMP11]], <8 x i32>
-; SSE-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <8 x i32>
-; SSE-NEXT: [[R71:%.*]] = shufflevector <8 x i32> [[R52]], <8 x i32> [[TMP12]], <8 x i32>
-; SSE-NEXT: ret <8 x i32> [[R71]]
+; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]]
+; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]]
+; SSE-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32>
+; SSE-NEXT: [[TMP9:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <8 x i32>
+; SSE-NEXT: [[R51:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32>
+; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R51]], i32 [[AB6]], i64 6
+; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i64 7
+; SSE-NEXT: ret <8 x i32> [[R7]]
 ;
 ; SLM-LABEL: @ashr_lshr_shl_v8i32(
 ; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
index 6beff01..17e2105 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll
@@ -52,15 +52,14 @@ entry:
 define <4 x float> @PR16739_byref(<4 x float>* nocapture readonly dereferenceable(16) %x) {
 ; CHECK-LABEL: @PR16739_byref(
 ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1
-; CHECK-NEXT: [[X0:%.*]] = load float, float* [[GEP0]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>*
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> poison, float [[X0]], i32 0
+; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32>
-; CHECK-NEXT: [[I21:%.*]] = shufflevector <4 x float> [[I0]], <4 x float> [[TMP3]], <4 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[I21]], <4 x float> [[TMP3]], <4 x i32>
-; CHECK-NEXT: ret <4 x float> [[TMP4]]
+; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[TMP3]], float [[X2]], i32 2
+; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3
+; CHECK-NEXT: ret <4 x float> [[I3]]
 ;
 %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0
 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
index dd08e79..a3cfa3b 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll
@@ -52,15 +52,14 @@ entry:
 define <4 x float> @PR16739_byref(<4 x float>* nocapture readonly dereferenceable(16) %x) {
 ; CHECK-LABEL: @PR16739_byref(
 ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1
-; CHECK-NEXT: [[X0:%.*]] = load float, float* [[GEP0]], align 4
-; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP1]] to <2 x float>*
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>*
 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
-; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[X0]], i32 0
+; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32>
-; CHECK-NEXT: [[I21:%.*]] = shufflevector <4 x float> [[I0]], <4 x float> [[TMP3]], <4 x i32>
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[I21]], <4 x float> [[TMP3]], <4 x i32>
-; CHECK-NEXT: ret <4 x float> [[TMP4]]
+; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[TMP3]], float [[X2]], i32 2
+; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3
+; CHECK-NEXT: ret <4 x float> [[I3]]
 ;
 %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0
 %gep1 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll
index e5479a3..176a8a0 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll
@@ -14,23 +14,9 @@ define void @test() #0 {
 ; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ undef, [[BB1]] ], [ undef, [[BB:%.*]] ]
 ; CHECK-NEXT: [[TMP2:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[BB1]] ], [ undef, [[BB]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> poison, i32 [[TMP]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i32> [[TMP0]], i32 [[TMP]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> [[TMP1]], i32 [[TMP]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x i32> [[TMP2]], i32 [[TMP]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> [[TMP3]], i32 [[TMP]], i32 4
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[TMP]], i32 5
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP5]], i32 [[TMP]], i32 6
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[TMP]], i32 7
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[TMP]], i32 8
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[TMP]], i32 9
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[TMP]], i32 10
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP]], i32 11
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP11]], i32 [[TMP]], i32 12
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP]], i32 13
-; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP13]], i32 [[TMP]], i32 14
-; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP]], i32 15
-; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[TMP15]])
-; CHECK-NEXT: [[OP_RDX]] = mul i32 [[TMP16]], undef
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[SHUFFLE]])
+; CHECK-NEXT: [[OP_RDX]] = mul i32 [[TMP1]], undef
 ; CHECK-NEXT: br label [[BB1]]
 ;
 bb:
@@ -66,16 +52,10 @@ define void @test_2(i8 addrspace(1)* %arg, i32 %arg1) #0 {
 ; CHECK-NEXT: [[TMP:%.*]] = phi i32 [ undef, [[BB:%.*]] ], [ undef, [[BB2]] ]
 ; CHECK-NEXT: [[TMP3:%.*]] = phi i32 [ 0, [[BB]] ], [ undef, [[BB2]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> poison, i32 [[TMP]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[TMP]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP]], i32 2
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP]], i32 3
-; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP]], i32 4
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP]], i32 5
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP]], i32 6
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP]], i32 7
-; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP7]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[SHUFFLE]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = add i32 [[TMP1]], [[TMP2]]
 ; CHECK-NEXT: [[OP_RDX1:%.*]] = add i32 [[OP_RDX]], undef
 ; CHECK-NEXT: call void @use(i32 [[OP_RDX1]])
 ; CHECK-NEXT: br label [[BB2]]
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
index 2b8dcc3..c97c65f 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll
@@ -12,28 +12,27 @@ define fastcc void @_ZN12_GLOBAL__N_127PolynomialMultiplyRecognize9recognizeEv()
 ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 0
 ; CHECK-NEXT: [[SUB_I:%.*]] = add nsw i32 undef, -1
 ; CHECK-NEXT: [[CONV31_I:%.*]] = and i32 undef, [[SUB_I]]
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[SHUFFLE]],
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[SHUFFLE1]],
-; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[CONV31_I]], i32 0
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[CONV31_I]], i32 1
-; CHECK-NEXT: [[TMP7:%.*]] = lshr <2 x i32> [[TMP6]],
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <16 x i32>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <16 x i32> [[TMP8]], <16 x i32> [[TMP9]], <16 x i32>
-; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <16 x i32>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32>
-; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[SHR_12_I_I]], i32 13
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <16 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i32> [[TMP13]], <16 x i32> [[TMP14]], <16 x i32>
-; CHECK-NEXT: [[TMP16:%.*]] = trunc <16 x i32> [[TMP15]] to <16 x i8>
-; CHECK-NEXT: [[TMP17:%.*]] = and <16 x i8> [[TMP16]],
-; CHECK-NEXT: [[TMP18:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
-; CHECK-NEXT: store <16 x i8> [[TMP17]], <16 x i8>* [[TMP18]], align 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[CONV31_I]], i32 0
+; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[SHUFFLE1]],
+; CHECK-NEXT: [[SHR_4_I_I:%.*]] = lshr i32 [[CONV31_I]], 5
+; CHECK-NEXT: [[SHR_5_I_I:%.*]] = lshr i32 [[CONV31_I]], 6
+; CHECK-NEXT: [[SHR_6_I_I:%.*]] = lshr i32 [[CONV31_I]], 7
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[CONV31_I]], i32 0
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = lshr <8 x i32> [[SHUFFLE]],
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[SUB_I]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> [[TMP6]], <16 x i32>
+; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[SHR_4_I_I]], i32 5
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[SHR_5_I_I]], i32 6
+; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP9]], i32 [[SHR_6_I_I]], i32 7
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i32> [[TMP10]], <16 x i32> [[TMP11]], <16 x i32>
+; CHECK-NEXT: [[TMP13:%.*]] = trunc <16 x i32> [[TMP12]] to <16 x i8>
+; CHECK-NEXT: [[TMP14:%.*]] = and <16 x i8> [[TMP13]],
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
+; CHECK-NEXT: store <16 x i8> [[TMP14]], <16 x i8>* [[TMP15]], align 1
 ; CHECK-NEXT: unreachable
 ; CHECK: if.end50.i:
 ; CHECK-NEXT: ret void
-- 
2.7.4