return false;
+/// Checks if the \p IE1 instruction is followed by the \p IE2 instruction in
+/// the buildvector sequence.
+static bool isFirstInsertElement(const InsertElementInst *IE1,
+ const InsertElementInst *IE2) {
+ const auto *I1 = IE1; // Cursor walking up the operand-0 chain from IE1.
+ const auto *I2 = IE2; // Cursor walking up the operand-0 chain from IE2.
+ do {
+ if (I2 == IE1) // Chain from IE2 reached IE1 (also hit when IE1 == IE2).
+ return true;
+ if (I1 == IE2) // Chain from IE1 reached IE2.
+ return false;
+ if (I1) // Keep stepping only while the cursor is still an insertelement.
+ I1 = dyn_cast<InsertElementInst>(I1->getOperand(0));
+ if (I2)
+ I2 = dyn_cast<InsertElementInst>(I2->getOperand(0));
+ } while (I1 || I2); // Ends once both chains are exhausted.
+ llvm_unreachable("Two different buildvectors not expected.");
+/// Does the analysis of the provided shuffle masks and performs the requested
+/// actions on the vectors with the given shuffle masks. It tries to do it in
+/// several steps.
+/// 1. If the Base vector is not an undef vector, resizes the very first mask to
+/// have a common VF and performs the action for 2 input vectors (including the
+/// non-undef Base). Other shuffle masks are combined with the result of the
+/// first stage and processed as a shuffle of 2 vectors.
+/// 2. If the Base is an undef vector and there is only 1 shuffle mask, performs
+/// the action only for 1 vector with the given mask, if it is not the identity
+/// mask.
+/// 3. If > 2 masks are used, performs the remaining shuffle actions for 2
+/// vectors, combining the masks properly between the steps.
+template <typename T>
+static T *performExtractsShuffleAction(
+ MutableArrayRef<std::pair<T *, SmallVector<int>>> ShuffleMask, Value *Base,
+ function_ref<unsigned(T *)> GetVF, // Queried for the VF of an input vector.
+ function_ref<std::pair<T *, bool>(T *, ArrayRef<int>)> ResizeAction,
+ function_ref<T *(ArrayRef<int>, ArrayRef<T *>)> Action) {
+ assert(!ShuffleMask.empty() && "Empty list of shuffles for inserts.");
+ SmallVector<int> Mask(ShuffleMask.begin()->second); // Working copy, 1st mask.
+ auto VMIt = std::next(ShuffleMask.begin()); // Second (vector, mask) entry.
+ T *Prev = nullptr; // Result of the most recent step.
+ bool IsBaseNotUndef = !isUndefVector(Base);
+ if (IsBaseNotUndef) {
+ // Base is not undef, need to combine it with the next subvectors.
+ std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask);
+ for (unsigned Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
+ if (Mask[Idx] == UndefMaskElem)
+ Mask[Idx] = Idx; // Unused lane: keep the lane of the first operand.
+ else
+ Mask[Idx] = (Res.second ? Idx : Mask[Idx]) + VF; // Pick from 2nd operand.
+ }
+ Prev = Action(Mask, {nullptr, Res.first}); // NOTE(review): nullptr presumably stands for Base - confirm.
+ } else if (ShuffleMask.size() == 1) {
+ // Base is undef and only 1 vector is shuffled - perform the action only for
+ // single vector, if the mask is not the identity mask.
+ std::pair<T *, bool> Res = ResizeAction(ShuffleMask.begin()->first, Mask);
+ if (Res.second)
+ // Identity mask is found.
+ Prev = Res.first;
+ else
+ Prev = Action(Mask, {ShuffleMask.begin()->first});
+ } else {
+ // Base is undef and at least 2 input vectors shuffled - perform 2 vectors
+ // shuffles step by step, combining shuffle between the steps.
+ unsigned Vec1VF = GetVF(ShuffleMask.begin()->first);
+ unsigned Vec2VF = GetVF(VMIt->first);
+ if (Vec1VF == Vec2VF) {
+ // No need to resize the input vectors since they are of the same size, we
+ // can shuffle them directly.
+ ArrayRef<int> SecMask = VMIt->second;
+ for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
+ if (SecMask[I] != UndefMaskElem) {
+ assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ Mask[I] = SecMask[I] + Vec1VF; // Offset into the second operand.
+ }
+ }
+ Prev = Action(Mask, {ShuffleMask.begin()->first, VMIt->first});
+ } else {
+ // Vectors of different sizes - resize and reshuffle.
+ std::pair<T *, bool> Res1 =
+ ResizeAction(ShuffleMask.begin()->first, Mask);
+ std::pair<T *, bool> Res2 = ResizeAction(VMIt->first, VMIt->second);
+ ArrayRef<int> SecMask = VMIt->second;
+ for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
+ if (Mask[I] != UndefMaskElem) {
+ assert(SecMask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ if (Res1.second)
+ Mask[I] = I; // Flag set for the 1st vector: use the identity lane.
+ } else if (SecMask[I] != UndefMaskElem) {
+ assert(Mask[I] == UndefMaskElem && "Multiple uses of scalars.");
+ Mask[I] = (Res2.second ? I : SecMask[I]) + VF; // Pick from 2nd operand.
+ }
+ }
+ Prev = Action(Mask, {Res1.first, Res2.first});
+ }
+ VMIt = std::next(VMIt); // The second entry was consumed above.
+ }
+ // Perform requested actions for the remaining masks/vectors.
+ for (auto E = ShuffleMask.end(); VMIt != E; ++VMIt) {
+ // Shuffle other input vectors, if any.
+ std::pair<T *, bool> Res = ResizeAction(VMIt->first, VMIt->second);
+ ArrayRef<int> SecMask = VMIt->second;
+ for (unsigned I = 0, VF = Mask.size(); I < VF; ++I) {
+ if (SecMask[I] != UndefMaskElem) {
+ assert((Mask[I] == UndefMaskElem || IsBaseNotUndef) &&
+ "Multiple uses of scalars.");
+ Mask[I] = (Res.second ? I : SecMask[I]) + VF; // Pick from the new vector.
+ } else if (Mask[I] != UndefMaskElem) {
+ Mask[I] = I; // Lane already defined: take it from Prev unchanged.
+ }
+ }
+ Prev = Action(Mask, {Prev, Res.first}); // Fold the next vector into Prev.
+ }
+ return Prev; // Result of the last step performed.
InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
InstructionCost Cost = 0;
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
SmallPtrSet<Value *, 16> ExtractCostCalculated;
InstructionCost ExtractCost = 0;
- SmallVector<unsigned> VF;
- SmallVector<SmallVector<int>> ShuffleMask;
- SmallVector<Value *> FirstUsers;
+ SmallVector<MapVector<const TreeEntry *, SmallVector<int>>> ShuffleMasks;
+ SmallVector<std::pair<Value *, const TreeEntry *>> FirstUsers;
SmallVector<APInt> DemandedElts;
for (ExternalUser &EU : ExternalUses) {
// We only add extract cost once for the same scalar.
if (auto *FTy = dyn_cast<FixedVectorType>(VU->getType())) {
Optional<unsigned> InsertIdx = getInsertIndex(VU);
if (InsertIdx) {
- auto *It = find_if(FirstUsers, [VU](Value *V) {
- return areTwoInsertFromSameBuildVector(VU,
- cast<InsertElementInst>(V));
- });
+ const TreeEntry *ScalarTE = getTreeEntry(EU.Scalar);
+ auto *It =
+ find_if(FirstUsers,
+ [VU](const std::pair<Value *, const TreeEntry *> &Pair) {
+ return areTwoInsertFromSameBuildVector(
+ VU, cast<InsertElementInst>(Pair.first));
+ });
int VecId = -1;
if (It == FirstUsers.end()) {
- VF.push_back(FTy->getNumElements());
- ShuffleMask.emplace_back(VF.back(), UndefMaskElem);
+ (void)ShuffleMasks.emplace_back();
// Find the insertvector, vectorized in tree, if any.
Value *Base = VU;
while (auto *IEBase = dyn_cast<InsertElementInst>(Base)) {
VU = IEBase;
do {
int Idx = E->findLaneForValue(Base);
- ShuffleMask.back()[Idx] = Idx;
+ SmallVectorImpl<int> &Mask = ShuffleMasks.back()[ScalarTE];
+ if (Mask.empty())
+ Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ Mask[Idx] = Idx;
Base = cast<InsertElementInst>(Base)->getOperand(0);
} while (E == getTreeEntry(Base));
Base = cast<InsertElementInst>(Base)->getOperand(0);
- FirstUsers.push_back(VU);
- DemandedElts.push_back(APInt::getZero(VF.back()));
+ FirstUsers.emplace_back(VU, ScalarTE);
+ DemandedElts.push_back(APInt::getZero(FTy->getNumElements()));
VecId = FirstUsers.size() - 1;
} else {
+ if (isFirstInsertElement(VU, cast<InsertElementInst>(It->first)))
+ It->first = VU;
VecId = std::distance(FirstUsers.begin(), It);
int InIdx = *InsertIdx;
- ShuffleMask[VecId][InIdx] = EU.Lane;
+ SmallVectorImpl<int> &Mask = ShuffleMasks[VecId][ScalarTE];
+ if (Mask.empty())
+ Mask.assign(FTy->getNumElements(), UndefMaskElem);
+ assert(Mask[InIdx] == UndefMaskElem &&
+ "InsertElementInstruction used already.");
+ Mask[InIdx] = EU.Lane;
InstructionCost SpillCost = getSpillCost();
Cost += SpillCost + ExtractCost;
- if (FirstUsers.size() == 1) {
- int Limit = ShuffleMask.front().size() * 2;
- if (!all_of(ShuffleMask.front(),
- [Limit](int Idx) { return Idx < Limit; }) ||
- !ShuffleVectorInst::isIdentityMask(ShuffleMask.front())) {
- InstructionCost C = TTI->getShuffleCost(
+ auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask) {
+ InstructionCost C = 0;
+ unsigned VF = Mask.size();
+ unsigned VecVF = TE->getVectorFactor();
+ if (VF != VecVF &&
+ (any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
+ (all_of(Mask,
+ [VF](int Idx) { return Idx < 2 * static_cast<int>(VF); }) &&
+ !ShuffleVectorInst::isIdentityMask(Mask)))) {
+ SmallVector<int> OrigMask(VecVF, UndefMaskElem);
+ std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
+ OrigMask.begin());
+ C = TTI->getShuffleCost(
- cast<FixedVectorType>(FirstUsers.front()->getType()),
- ShuffleMask.front());
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
- << " for final shuffle of insertelement external users "
- << *VectorizableTree.front()->Scalars.front() << ".\n"
- << "SLP: Current total cost = " << Cost << "\n");
+ FixedVectorType::get(TE->getMainOp()->getType(), VecVF), OrigMask);
+ dbgs() << "SLP: Adding cost " << C
+ << " for final shuffle of insertelement external users.\n";
+ TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
Cost += C;
+ return std::make_pair(TE, true);
+ return std::make_pair(TE, false);
+ };
+ // Calculate the cost of the reshuffled vectors, if any.
+ for (int I = 0, E = FirstUsers.size(); I < E; ++I) {
+ Value *Base = cast<Instruction>(FirstUsers[I].first)->getOperand(0);
+ unsigned VF = ShuffleMasks[I].begin()->second.size();
+ auto *FTy = FixedVectorType::get(
+ cast<VectorType>(FirstUsers[I].first->getType())->getElementType(), VF);
+ auto Vector = ShuffleMasks[I].takeVector();
+ auto &&EstimateShufflesCost = [this, FTy,
+ &Cost](ArrayRef<int> Mask,
+ ArrayRef<const TreeEntry *> TEs) {
+ assert((TEs.size() == 1 || TEs.size() == 2) &&
+ "Expected exactly 1 or 2 tree entries.");
+ if (TEs.size() == 1) {
+ int Limit = 2 * Mask.size();
+ if (!all_of(Mask, [Limit](int Idx) { return Idx < Limit; }) ||
+ !ShuffleVectorInst::isIdentityMask(Mask)) {
+ InstructionCost C =
+ TTI->getShuffleCost(TTI::SK_PermuteSingleSrc, FTy, Mask);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for final shuffle of insertelement "
+ "external users.\n";
+ TEs.front()->dump();
+ dbgs() << "SLP: Current total cost = " << Cost << "\n");
+ Cost += C;
+ }
+ } else {
+ InstructionCost C =
+ TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, FTy, Mask);
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for final shuffle of vector node and external "
+ "insertelement users.\n";
+ if (TEs.front()) { TEs.front()->dump(); } TEs.back()->dump();
+ dbgs() << "SLP: Current total cost = " << Cost << "\n");
+ Cost += C;
+ }
+ return TEs.back();
+ };
+ // Vector holds the (tree entry, mask) pairs taken out of ShuffleMasks[I];
+ // hand them over as a mutable view of its storage. The returned entry is
+ // not needed here, only the cost accumulated by the callbacks.
+ (void)performExtractsShuffleAction<const TreeEntry>(
+ makeMutableArrayRef(Vector.data(), Vector.size()), Base,
+ [](const TreeEntry *E) { return E->getVectorFactor(); }, ResizeToVF,
+ EstimateShufflesCost);
InstructionCost InsertCost = TTI->getScalarizationOverhead(
- cast<FixedVectorType>(FirstUsers.front()->getType()),
- DemandedElts.front(), /*Insert*/ true, /*Extract*/ false);
- LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
- << " for insertelements gather.\n"
- << "SLP: Current total cost = " << Cost << "\n");
- Cost -= InsertCost;
- } else if (FirstUsers.size() >= 2) {
- unsigned MaxVF = *std::max_element(VF.begin(), VF.end());
- // Combined masks of the first 2 vectors.
- SmallVector<int> CombinedMask(MaxVF, UndefMaskElem);
- copy(ShuffleMask.front(), CombinedMask.begin());
- APInt CombinedDemandedElts = DemandedElts.front().zextOrSelf(MaxVF);
- auto *VecTy = FixedVectorType::get(
- cast<VectorType>(FirstUsers.front()->getType())->getElementType(),
- MaxVF);
- for (int I = 0, E = ShuffleMask[1].size(); I < E; ++I) {
- if (ShuffleMask[1][I] != UndefMaskElem) {
- CombinedMask[I] = ShuffleMask[1][I] + MaxVF;
- CombinedDemandedElts.setBit(I);
- }
- }
- InstructionCost C =
- TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
- << " for final shuffle of vector node and external "
- "insertelement users "
- << *VectorizableTree.front()->Scalars.front() << ".\n"
- << "SLP: Current total cost = " << Cost << "\n");
- Cost += C;
- InstructionCost InsertCost = TTI->getScalarizationOverhead(
- VecTy, CombinedDemandedElts, /*Insert*/ true, /*Extract*/ false);
- LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
- << " for insertelements gather.\n"
- << "SLP: Current total cost = " << Cost << "\n");
+ cast<FixedVectorType>(FirstUsers[I].first->getType()), DemandedElts[I],
+ /*Insert*/ true, /*Extract*/ false);
Cost -= InsertCost;
- for (int I = 2, E = FirstUsers.size(); I < E; ++I) {
- if (ShuffleMask[I].empty())
- continue;
- // Other elements - permutation of 2 vectors (the initial one and the
- // next Ith incoming vector).
- unsigned VF = ShuffleMask[I].size();
- for (unsigned Idx = 0; Idx < VF; ++Idx) {
- int Mask = ShuffleMask[I][Idx];
- if (Mask != UndefMaskElem)
- CombinedMask[Idx] = MaxVF + Mask;
- else if (CombinedMask[Idx] != UndefMaskElem)
- CombinedMask[Idx] = Idx;
- }
- for (unsigned Idx = VF; Idx < MaxVF; ++Idx)
- if (CombinedMask[Idx] != UndefMaskElem)
- CombinedMask[Idx] = Idx;
- InstructionCost C =
- TTI->getShuffleCost(TTI::SK_PermuteTwoSrc, VecTy, CombinedMask);
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
- << " for final shuffle of vector node and external "
- "insertelement users "
- << *VectorizableTree.front()->Scalars.front() << ".\n"
- << "SLP: Current total cost = " << Cost << "\n");
- Cost += C;
- InstructionCost InsertCost = TTI->getScalarizationOverhead(
- cast<FixedVectorType>(FirstUsers[I]->getType()), DemandedElts[I],
- /*Insert*/ true, /*Extract*/ false);
- LLVM_DEBUG(dbgs() << "SLP: subtracting the cost " << InsertCost
- << " for insertelements gather.\n"
- << "SLP: Current total cost = " << Cost << "\n");
- Cost -= InsertCost;
- }
#ifndef NDEBUG
; CHECK-NEXT: [[TAB2:%.*]] = alloca [256 x i32], align 16
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[MUL19:%.*]] = fmul double [[P1:%.*]], 1.638400e+04
; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04
; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03
-; CHECK-NEXT: [[MUL21:%.*]] = fmul double [[P2:%.*]], 1.638400e+04
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P1:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P2:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 1.638400e+04, double 1.638400e+04>
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> <double 0.000000e+00, double poison>, double [[ADD]], i32 1
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[T_0259:%.*]] = phi double [ 0.000000e+00, [[BB1]] ], [ [[ADD27:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[P3_ADDR_0258:%.*]] = phi double [ [[ADD]], [[BB1]] ], [ [[ADD28:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> poison, double [[T_0259]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP3]], [[BB1]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I237]])
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 0, i64 [[INDVARS_IV266]]
; CHECK-NEXT: store i32 [[X13]], i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> poison, double [[P3_ADDR_0258]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> poison, double [[TMP6]], i32 0
; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I]])
; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]]
; CHECK-NEXT: store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, !tbaa [[TBAA0]]
-; CHECK-NEXT: [[ADD27]] = fadd double [[MUL19]], [[T_0259]]
-; CHECK-NEXT: [[ADD28]] = fadd double [[MUL21]], [[P3_ADDR_0258]]
+; CHECK-NEXT: [[TMP7]] = fadd <2 x double> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256
; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]]
; CHECK-NEXT: [[TAB2:%.*]] = alloca [256 x i32], align 16
; CHECK-NEXT: br label [[BB1:%.*]]
; CHECK: bb1:
-; CHECK-NEXT: [[MUL19:%.*]] = fmul double [[P1:%.*]], 1.638400e+04
; CHECK-NEXT: [[MUL20:%.*]] = fmul double [[P3:%.*]], 1.638400e+04
; CHECK-NEXT: [[ADD:%.*]] = fadd double [[MUL20]], 8.192000e+03
-; CHECK-NEXT: [[MUL21:%.*]] = fmul double [[P2:%.*]], 1.638400e+04
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[P1:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[P2:%.*]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], <double 1.638400e+04, double 1.638400e+04>
+; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> <double 0.000000e+00, double poison>, double [[ADD]], i32 1
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[INDVARS_IV266:%.*]] = phi i64 [ 0, [[BB1]] ], [ [[INDVARS_IV_NEXT267:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[T_0259:%.*]] = phi double [ 0.000000e+00, [[BB1]] ], [ [[ADD27:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[P3_ADDR_0258:%.*]] = phi double [ [[ADD]], [[BB1]] ], [ [[ADD28:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> undef, double [[T_0259]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x double> [ [[TMP3]], [[BB1]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP4]], i32 0
+; CHECK-NEXT: [[VECINIT_I_I237:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0
; CHECK-NEXT: [[X13:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I237]])
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB1]], i64 0, i64 [[INDVARS_IV266]]
; CHECK-NEXT: store i32 [[X13]], i32* [[ARRAYIDX]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> undef, double [[P3_ADDR_0258]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP4]], i32 1
+; CHECK-NEXT: [[VECINIT_I_I:%.*]] = insertelement <2 x double> undef, double [[TMP6]], i32 0
; CHECK-NEXT: [[X14:%.*]] = tail call i32 @_xfn(<2 x double> [[VECINIT_I_I]])
; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [256 x i32], [256 x i32]* [[TAB2]], i64 0, i64 [[INDVARS_IV266]]
; CHECK-NEXT: store i32 [[X14]], i32* [[ARRAYIDX26]], align 4, !tbaa [[TBAA0]]
-; CHECK-NEXT: [[ADD27]] = fadd double [[MUL19]], [[T_0259]]
-; CHECK-NEXT: [[ADD28]] = fadd double [[MUL21]], [[P3_ADDR_0258]]
+; CHECK-NEXT: [[TMP7]] = fadd <2 x double> [[TMP2]], [[TMP4]]
; CHECK-NEXT: [[INDVARS_IV_NEXT267]] = add nuw nsw i64 [[INDVARS_IV266]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT267]], 256
; CHECK-NEXT: br i1 [[EXITCOND]], label [[RETURN:%.*]], label [[FOR_BODY]]
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[BODY:%.*]]
; CHECK: body:
-; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x double> [ zeroinitializer, [[ENTRY:%.*]] ], [ zeroinitializer, [[BODY]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x double> [[TMP0]], i32 1
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> <double undef, double poison>, double [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP2]], zeroinitializer
-; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP3]], i32 1
-; CHECK-NEXT: [[ADD8_I_I:%.*]] = fadd fast double [[TMP5]], [[TMP4]]
+; CHECK-NEXT: [[PHI1:%.*]] = phi double [ 0.000000e+00, [[ENTRY:%.*]] ], [ 0.000000e+00, [[BODY]] ]
+; CHECK-NEXT: [[PHI2:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ 0.000000e+00, [[BODY]] ]
+; CHECK-NEXT: [[MUL_I478_I:%.*]] = fmul fast double [[PHI1]], 0.000000e+00
+; CHECK-NEXT: [[MUL7_I485_I:%.*]] = fmul fast double undef, 0.000000e+00
+; CHECK-NEXT: [[ADD8_I_I:%.*]] = fadd fast double [[MUL_I478_I]], [[MUL7_I485_I]]
; CHECK-NEXT: [[CMP42_I:%.*]] = fcmp fast ole double [[ADD8_I_I]], 0.000000e+00
; CHECK-NEXT: br i1 false, label [[BODY]], label [[EXIT:%.*]]
; CHECK: exit:
; CHECK-NEXT: br i1 false, label [[IF_THEN135_I:%.*]], label [[IF_END209_I:%.*]]
; CHECK: if.then135.i:
-; CHECK-NEXT: [[TMP6:%.*]] = fcmp fast olt <2 x double> [[TMP0]], zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i1> <i1 poison, i1 false>, i1 [[TMP7]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP8]], <2 x double> zeroinitializer, <2 x double> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP9]]
-; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <2 x double> [[TMP10]], zeroinitializer
-; CHECK-NEXT: [[TMP12:%.*]] = fadd fast <2 x double> [[TMP11]], zeroinitializer
+; CHECK-NEXT: [[CMP145_I:%.*]] = fcmp fast olt double [[PHI1]], 0.000000e+00
+; CHECK-NEXT: [[CMP152_I:%.*]] = fcmp fast olt double [[PHI2]], 0.000000e+00
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i1> <i1 poison, i1 false>, i1 [[CMP152_I]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> [[TMP0]], <2 x double> zeroinitializer, <2 x double> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <2 x double> zeroinitializer, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = fmul fast <2 x double> [[TMP2]], zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP3]], zeroinitializer
; CHECK-NEXT: br label [[IF_END209_I]]
; CHECK: if.end209.i:
-; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x double> [ [[TMP12]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x double> [ [[TMP4]], [[IF_THEN135_I]] ], [ zeroinitializer, [[EXIT]] ]
; CHECK-NEXT: ret void