From 07ef9f513f247f6ceb14d72b8218401e070d450d Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Mon, 25 Oct 2021 07:32:35 -0700
Subject: [PATCH] [SLP]Improve/fix reordering of the gathered graph nodes.

Gathered loads/extractelements/extractvalue instructions should be
checked if they can represent a vector reordering node too, and their
order should be taken into account for better graph reordering
analysis. Also, if the gather node has reused scalars, they must be
reordered instead of the scalars themselves.

Differential Revision: https://reviews.llvm.org/D112454
---
 llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp    | 199 +++++++++++++++------
 .../AArch64/transpose-inseltpoison.ll              |  28 ++-
 .../Transforms/SLPVectorizer/AArch64/transpose.ll  |  28 ++-
 .../test/Transforms/SLPVectorizer/X86/lookahead.ll |  23 ++-
 4 files changed, 185 insertions(+), 93 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 63bc1fa..ec0f111 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -766,6 +766,12 @@ public:
   /// Perform LICM and CSE on the newly generated gather sequences.
   void optimizeGatherSequence();
 
+  /// Checks if the specified gather tree entry \p TE can be represented as a
+  /// shuffled vector entry + (possibly) permutation with other gathers. It
+  /// implements the checks only for possibly ordered scalars (Loads,
+  /// ExtractElement, ExtractValue), which can be part of the graph.
+  Optional<OrdersType> findReusedOrderedScalars(const TreeEntry &TE);
+
   /// Reorders the current graph to the most profitable order starting from the
   /// root node to the leaf nodes. The best order is chosen only from the nodes
   /// of the same size (vectorization factor). Smaller nodes are considered
@@ -2670,6 +2676,72 @@ static void reorderOrder(SmallVectorImpl<unsigned> &Order, ArrayRef<int> Mask) {
   fixupOrderingIndices(Order);
 }
 
+Optional<BoUpSLP::OrdersType>
+BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE) {
+  assert(TE.State == TreeEntry::NeedToGather && "Expected gather node only.");
+  unsigned NumScalars = TE.Scalars.size();
+  OrdersType CurrentOrder(NumScalars, NumScalars);
+  SmallVector<int> Positions;
+  SmallBitVector UsedPositions(NumScalars);
+  const TreeEntry *STE = nullptr;
+  // Try to find all gathered scalars that are gets vectorized in other
+  // vectorize node. Here we can have only one single tree vector node to
+  // correctly identify order of the gathered scalars.
+  for (unsigned I = 0; I < NumScalars; ++I) {
+    Value *V = TE.Scalars[I];
+    if (!isa<LoadInst, ExtractElementInst, ExtractValueInst>(V))
+      continue;
+    if (const auto *LocalSTE = getTreeEntry(V)) {
+      if (!STE)
+        STE = LocalSTE;
+      else if (STE != LocalSTE)
+        // Take the order only from the single vector node.
+        return None;
+      unsigned Lane =
+          std::distance(STE->Scalars.begin(), find(STE->Scalars, V));
+      if (Lane >= NumScalars)
+        return None;
+      if (CurrentOrder[Lane] != NumScalars) {
+        if (Lane != I)
+          continue;
+        UsedPositions.reset(CurrentOrder[Lane]);
+      }
+      // The partial identity (where only some elements of the gather node are
+      // in the identity order) is good.
+      CurrentOrder[Lane] = I;
+      UsedPositions.set(I);
+    }
+  }
+  // Need to keep the order if we have a vector entry and at least 2 scalars or
+  // the vectorized entry has just 2 scalars.
+  if (STE && (UsedPositions.count() > 1 || STE->Scalars.size() == 2)) {
+    auto &&IsIdentityOrder = [NumScalars](ArrayRef<unsigned> CurrentOrder) {
+      for (unsigned I = 0; I < NumScalars; ++I)
+        if (CurrentOrder[I] != I && CurrentOrder[I] != NumScalars)
+          return false;
+      return true;
+    };
+    if (IsIdentityOrder(CurrentOrder)) {
+      CurrentOrder.clear();
+      return CurrentOrder;
+    }
+    auto *It = CurrentOrder.begin();
+    for (unsigned I = 0; I < NumScalars;) {
+      if (UsedPositions.test(I)) {
+        ++I;
+        continue;
+      }
+      if (*It == NumScalars) {
+        *It = I;
+        ++I;
+      }
+      ++It;
+    }
+    return CurrentOrder;
+  }
+  return None;
+}
+
 void BoUpSLP::reorderTopToBottom() {
   // Maps VF to the graph nodes.
   DenseMap<unsigned, SmallPtrSet<TreeEntry *, 4>> VFToOrderedEntries;
@@ -2689,19 +2761,29 @@ void BoUpSLP::reorderTopToBottom() {
                             InsertElementInst>(TE->getMainOp()) &&
             !TE->isAltShuffle()) {
           VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
-        } else if (TE->State == TreeEntry::NeedToGather &&
-                   TE->getOpcode() == Instruction::ExtractElement &&
-                   !TE->isAltShuffle() &&
-                   isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
-                                            ->getVectorOperandType()) &&
-                   allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
-          // Check that gather of extractelements can be represented as
-          // just a shuffle of a single vector.
-          OrdersType CurrentOrder;
-          bool Reuse = canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
-          if (Reuse || !CurrentOrder.empty()) {
+          return;
+        }
+        if (TE->State == TreeEntry::NeedToGather) {
+          if (TE->getOpcode() == Instruction::ExtractElement &&
+              !TE->isAltShuffle() &&
+              isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
+                                       ->getVectorOperandType()) &&
+              allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
+            // Check that gather of extractelements can be represented as
+            // just a shuffle of a single vector.
+            OrdersType CurrentOrder;
+            bool Reuse =
+                canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
+            if (Reuse || !CurrentOrder.empty()) {
+              VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
+              GathersToOrders.try_emplace(TE.get(), CurrentOrder);
+              return;
+            }
+          }
+          if (Optional<OrdersType> CurrentOrder =
+                  findReusedOrderedScalars(*TE.get())) {
             VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
-            GathersToOrders.try_emplace(TE.get(), CurrentOrder);
+            GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
           }
         }
       });
@@ -2718,7 +2800,9 @@ void BoUpSLP::reorderTopToBottom() {
     const SmallPtrSetImpl<TreeEntry *> &OrderedEntries = It->getSecond();
     // All operands are reordered and used only in this node - propagate the
     // most used order to the user node.
-    DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrdersUses;
+    MapVector<OrdersType, unsigned,
+              DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
+        OrdersUses;
     SmallPtrSet<const TreeEntry *, 4> VisitedOps;
     for (const TreeEntry *OpTE : OrderedEntries) {
       // No need to reorder this nodes, still need to extend and to use shuffle,
@@ -2742,18 +2826,18 @@
           return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
         });
         fixupOrderingIndices(CurrentOrder);
-        ++OrdersUses.try_emplace(CurrentOrder).first->getSecond();
+        ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
       } else {
-        ++OrdersUses.try_emplace(Order).first->getSecond();
+        ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
      }
    }
    // Set order of the user node.
    if (OrdersUses.empty())
      continue;
    // Choose the most used order.
-    ArrayRef<unsigned> BestOrder = OrdersUses.begin()->first;
-    unsigned Cnt = OrdersUses.begin()->second;
-    for (const auto &Pair : llvm::drop_begin(OrdersUses)) {
+    ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
+    unsigned Cnt = OrdersUses.front().second;
+    for (const auto &Pair : drop_begin(OrdersUses)) {
      if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
        BestOrder = Pair.first;
        Cnt = Pair.second;
@@ -2830,6 +2914,8 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
   for_each(VectorizableTree, [this, &OrderedEntries, &GathersToOrders,
                               &NonVectorized](
                                  const std::unique_ptr<TreeEntry> &TE) {
+    if (TE->State != TreeEntry::Vectorize)
+      NonVectorized.push_back(TE.get());
     // No need to reorder if need to shuffle reuses, still need to shuffle the
     // node.
     if (!TE->ReuseShuffleIndices.empty())
@@ -2838,28 +2924,37 @@
         isa<LoadInst, ExtractElementInst, ExtractValueInst, StoreInst,
             InsertElementInst>(TE->getMainOp()) &&
         !TE->isAltShuffle()) {
      OrderedEntries.insert(TE.get());
-    } else if (TE->State == TreeEntry::NeedToGather &&
-               TE->getOpcode() == Instruction::ExtractElement &&
-               !TE->isAltShuffle() &&
-               isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
-                                        ->getVectorOperandType()) &&
-               allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
-      // Check that gather of extractelements can be represented as
-      // just a shuffle of a single vector with a single user only.
-      OrdersType CurrentOrder;
-      bool Reuse = canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
-      if ((Reuse || !CurrentOrder.empty()) &&
-          !any_of(
-              VectorizableTree, [&TE](const std::unique_ptr<TreeEntry> &Entry) {
-                return Entry->State == TreeEntry::NeedToGather &&
-                       Entry.get() != TE.get() && Entry->isSame(TE->Scalars);
-              })) {
+      return;
+    }
+    if (TE->State == TreeEntry::NeedToGather) {
+      if (TE->getOpcode() == Instruction::ExtractElement &&
+          !TE->isAltShuffle() &&
+          isa<FixedVectorType>(cast<ExtractElementInst>(TE->getMainOp())
+                                   ->getVectorOperandType()) &&
+          allSameType(TE->Scalars) && allSameBlock(TE->Scalars)) {
+        // Check that gather of extractelements can be represented as
+        // just a shuffle of a single vector with a single user only.
+        OrdersType CurrentOrder;
+        bool Reuse =
+            canReuseExtract(TE->Scalars, TE->getMainOp(), CurrentOrder);
+        if ((Reuse || !CurrentOrder.empty()) &&
+            !any_of(VectorizableTree,
+                    [&TE](const std::unique_ptr<TreeEntry> &Entry) {
+                      return Entry->State == TreeEntry::NeedToGather &&
+                             Entry.get() != TE.get() &&
+                             Entry->isSame(TE->Scalars);
+                    })) {
+          OrderedEntries.insert(TE.get());
+          GathersToOrders.try_emplace(TE.get(), CurrentOrder);
+          return;
+        }
+      }
+      if (Optional<OrdersType> CurrentOrder =
+              findReusedOrderedScalars(*TE.get())) {
        OrderedEntries.insert(TE.get());
-        GathersToOrders.try_emplace(TE.get(), CurrentOrder);
+        GathersToOrders.try_emplace(TE.get(), *CurrentOrder);
      }
    }
-    if (TE->State != TreeEntry::Vectorize)
-      NonVectorized.push_back(TE.get());
  });
 
  // Checks if the operands of the users are reordarable and have only single
@@ -2911,7 +3006,7 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
    for (TreeEntry *TE : OrderedEntries) {
      if (!(TE->State == TreeEntry::Vectorize ||
            (TE->State == TreeEntry::NeedToGather &&
-             TE->getOpcode() == Instruction::ExtractElement)) ||
+             GathersToOrders.count(TE))) ||
          TE->UserTreeIndices.empty() || !TE->ReuseShuffleIndices.empty() ||
          !all_of(drop_begin(TE->UserTreeIndices), [TE](const EdgeInfo &EI) {
@@ -2946,7 +3041,9 @@ void BoUpSLP::reorderBottomToTop(bool IgnoreReorder) {
      }
      // All operands are reordered and used only in this node - propagate the
      // most used order to the user node.
-      DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> OrdersUses;
+      MapVector<OrdersType, unsigned,
+                DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo>>
+          OrdersUses;
      SmallPtrSet<const TreeEntry *, 4> VisitedOps;
      for (const auto &Op : Data.second) {
        TreeEntry *OpTE = Op.second;
@@ -2969,13 +3066,14 @@
            return Idx == UndefMaskElem ? E : static_cast<unsigned>(Idx);
          });
          fixupOrderingIndices(CurrentOrder);
-          ++OrdersUses.try_emplace(CurrentOrder).first->getSecond();
+          ++OrdersUses.insert(std::make_pair(CurrentOrder, 0)).first->second;
        } else {
-          ++OrdersUses.try_emplace(Order).first->getSecond();
+          ++OrdersUses.insert(std::make_pair(Order, 0)).first->second;
        }
        if (VisitedOps.insert(OpTE).second)
-          OrdersUses.try_emplace({}, 0).first->getSecond() +=
+          OrdersUses.insert(std::make_pair(OrdersType(), 0)).first->second +=
              OpTE->UserTreeIndices.size();
+        assert(OrdersUses[{}] > 0 && "Counter cannot be less than 0.");
        --OrdersUses[{}];
      }
      // If no orders - skip current nodes and jump to the next one, if any.
      if (OrdersUses.empty()) {
@@ -2987,9 +3085,9 @@
        continue;
      }
      // Choose the best order.
-      ArrayRef<unsigned> BestOrder = OrdersUses.begin()->first;
-      unsigned Cnt = OrdersUses.begin()->second;
-      for (const auto &Pair : llvm::drop_begin(OrdersUses)) {
+      ArrayRef<unsigned> BestOrder = OrdersUses.front().first;
+      unsigned Cnt = OrdersUses.front().second;
+      for (const auto &Pair : drop_begin(OrdersUses)) {
        if (Cnt < Pair.second || (Cnt == Pair.second && Pair.first.empty())) {
          BestOrder = Pair.first;
          Cnt = Pair.second;
@@ -3032,10 +3130,13 @@
      }
      // For gathers just need to reorder its scalars.
      for (TreeEntry *Gather : GatherOps) {
-        if (!Gather->ReuseShuffleIndices.empty())
-          continue;
        assert(Gather->ReorderIndices.empty() &&
               "Unexpected reordering of gathers.");
+        if (!Gather->ReuseShuffleIndices.empty()) {
+          // Just reorder reuses indices.
+          reorderReuses(Gather->ReuseShuffleIndices, Mask);
+          continue;
+        }
        reorderScalars(Gather->Scalars, Mask);
        OrderedEntries.remove(Gather);
      }
@@ -7369,9 +7470,7 @@ struct SLPVectorizer : public FunctionPass {
    initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
  }
 
-  bool doInitialization(Module &M) override {
-    return false;
-  }
+  bool doInitialization(Module &M) override { return false; }
 
  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
index 96b143c..16fd83f 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll
@@ -32,21 +32,19 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 
 define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
 ; CHECK-LABEL: @store_chain_v2i64(
-; CHECK-NEXT:    [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1
-; CHECK-NEXT:    [[V0_0:%.*]] = load i64, i64* [[A]], align 8
-; CHECK-NEXT:    [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8
-; CHECK-NEXT:    [[V1_0:%.*]] = load i64, i64* [[B]], align 8
-; CHECK-NEXT:    [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    store i64 [[TMP2_0]], i64* [[C]], align 8
-; CHECK-NEXT:    store i64 [[TMP2_1]], i64* [[C_1]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = add <2 x i64> [[TMP10]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[TMP12]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a.0 = getelementptr i64, i64* %a, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
index 34b32f2..b4ed860 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -32,21 +32,19 @@ define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 
 define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
 ; CHECK-LABEL: @store_chain_v2i64(
-; CHECK-NEXT:    [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1
-; CHECK-NEXT:    [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1
-; CHECK-NEXT:    [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1
-; CHECK-NEXT:    [[V0_0:%.*]] = load i64, i64* [[A]], align 8
-; CHECK-NEXT:    [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8
-; CHECK-NEXT:    [[V1_0:%.*]] = load i64, i64* [[B]], align 8
-; CHECK-NEXT:    [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8
-; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
-; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
-; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    store i64 [[TMP2_0]], i64* [[C]], align 8
-; CHECK-NEXT:    store i64 [[TMP2_1]], i64* [[C_1]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[A:%.*]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[B:%.*]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <2 x i32>
+; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = sub <2 x i64> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = add <2 x i64> [[TMP10]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i64* [[C:%.*]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* [[TMP12]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %a.0 = getelementptr i64, i64* %a, i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
index 623fb60..96502d4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll
@@ -600,21 +600,18 @@ define void @ChecksExtractScores_different_vectors(double* %storeArray, double*
 ; CHECK-NEXT:    [[LOADVEC4:%.*]] = load <2 x double>, <2 x double>* [[VECPTR4:%.*]], align 4
 ; CHECK-NEXT:    [[EXTRB0:%.*]] = extractelement <2 x double> [[LOADVEC3]], i32 0
 ; CHECK-NEXT:    [[EXTRB1:%.*]] = extractelement <2 x double> [[LOADVEC4]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[EXTRB0]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRA1]], i32 1
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x double> [[TMP6]], double [[TMP7]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = fmul <2 x double> [[TMP4]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <2 x double> poison, double [[EXTRA0]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[EXTRB1]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = fmul <2 x double> [[TMP11]], [[TMP2]]
-; CHECK-NEXT:    [[TMP13:%.*]] = fadd <2 x double> [[TMP12]], [[TMP9]]
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> poison, double [[EXTRA1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[EXTRB0]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = fmul <2 x double> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> poison, double [[EXTRA0]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[EXTRB1]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = fmul <2 x double> [[TMP7]], [[TMP2]]
+; CHECK-NEXT:    [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], [[SHUFFLE]]
 ; CHECK-NEXT:    [[SIDX0:%.*]] = getelementptr inbounds double, double* [[STOREARRAY:%.*]], i64 0
 ; CHECK-NEXT:    [[SIDX1:%.*]] = getelementptr inbounds double, double* [[STOREARRAY]], i64 1
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast double* [[SIDX0]] to <2 x double>*
-; CHECK-NEXT:    store <2 x double> [[TMP13]], <2 x double>* [[TMP14]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast double* [[SIDX0]] to <2 x double>*
+; CHECK-NEXT:    store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %idx0 = getelementptr inbounds double, double* %array, i64 0
-- 
2.7.4
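
The core trick in findReusedOrderedScalars above can be illustrated outside of LLVM. The sketch below is not part of the patch: the names and data structures are invented for the example (plain std::vector/std::optional instead of the SLPVectorizer types), and it only mirrors the idea in simplified form. When some of a gather node's scalars already sit in the lanes of a single vectorized entry, a permutation is derived from those lane positions and the remaining slots are filled with the unused indices; the real code additionally bails out when the scalars come from more than one vectorized node.

// Standalone illustration (not LLVM code): recover a reuse order for a
// "gather" of scalars when (some of) those scalars already live in the
// lanes of a single vectorized group.
#include <cstdio>
#include <optional>
#include <vector>

using Order = std::vector<unsigned>;

// Scalars: the values the gather node needs, identified by integer ids.
// VectorizedLane: for each value id, the lane it occupies in the single
// vectorized group, or -1 if it is not vectorized there.
std::optional<Order> findReusedOrder(const std::vector<int> &Scalars,
                                     const std::vector<int> &VectorizedLane) {
  const unsigned NumScalars = Scalars.size();
  Order CurrentOrder(NumScalars, NumScalars); // NumScalars means "unset"
  std::vector<bool> UsedPositions(NumScalars, false);
  unsigned NumMatched = 0;

  for (unsigned I = 0; I < NumScalars; ++I) {
    int Lane = VectorizedLane[Scalars[I]];
    if (Lane < 0 || Lane >= static_cast<int>(NumScalars))
      continue; // Not covered by the vectorized group.
    if (CurrentOrder[Lane] == NumScalars) {
      // Scalar I should be read from lane 'Lane' of the vectorized value.
      CurrentOrder[Lane] = I;
      UsedPositions[I] = true;
      ++NumMatched;
    }
  }
  // Only worth reordering if at least two scalars are covered.
  if (NumMatched < 2)
    return std::nullopt;

  // Fill the remaining slots with the still-unused scalar indices so the
  // result is a full permutation of 0..NumScalars-1.
  unsigned Free = 0;
  for (unsigned Lane = 0; Lane < NumScalars; ++Lane) {
    if (CurrentOrder[Lane] != NumScalars)
      continue;
    while (Free < NumScalars && UsedPositions[Free])
      ++Free;
    CurrentOrder[Lane] = Free;
    UsedPositions[Free] = true;
  }
  return CurrentOrder;
}

int main() {
  // The gather wants scalars {2, 0, 3, 1}; the vectorized group holds value
  // v in lane v, so the recovered order is the permutation undoing the swap.
  std::vector<int> Scalars = {2, 0, 3, 1};
  std::vector<int> VectorizedLane = {0, 1, 2, 3}; // value id -> lane
  if (auto O = findReusedOrder(Scalars, VectorizedLane)) {
    for (unsigned Idx : *O)
      std::printf("%u ", Idx);
    std::printf("\n"); // prints: 1 3 0 2
  }
}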
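The reordering functions above also change how the "most used" order is picked for a user node: the per-order counters move from a DenseMap to a MapVector, so iteration (and therefore tie-breaking) follows insertion order deterministically. The following standalone sketch is again not the LLVM implementation; it uses invented names and a plain vector of pairs standing in for llvm::MapVector, and only shows the counting plus the selection rule used in the patch (highest count wins; on a tie the empty order, i.e. identity, is preferred because it needs no extra shuffle).

// Standalone illustration (not LLVM code): count how often each operand
// order is requested and pick the winner deterministically.
#include <cstdio>
#include <utility>
#include <vector>

using Order = std::vector<unsigned>;                        // empty = identity
using OrderUses = std::vector<std::pair<Order, unsigned>>;  // order -> count

static unsigned &countFor(OrderUses &Uses, const Order &O) {
  for (auto &Entry : Uses)
    if (Entry.first == O)
      return Entry.second;
  Uses.emplace_back(O, 0u);  // first use: insert with count 0, analogous to
  return Uses.back().second; // OrdersUses.insert(std::make_pair(Order, 0))
}

static const Order &pickBestOrder(const OrderUses &Uses) {
  const std::pair<Order, unsigned> *Best = &Uses.front();
  for (const auto &Entry : Uses)
    // Prefer a strictly higher count; on a tie prefer the empty (identity)
    // order, mirroring the (Cnt == Pair.second && Pair.first.empty()) check.
    if (Best->second < Entry.second ||
        (Best->second == Entry.second && Entry.first.empty()))
      Best = &Entry;
  return Best->first;
}

int main() {
  OrderUses Uses;
  ++countFor(Uses, {1, 0, 3, 2}); // one operand wants lanes swapped
  ++countFor(Uses, {});           // one operand is happy with identity
  ++countFor(Uses, {1, 0, 3, 2}); // another operand wants the same swap
  const Order &Best = pickBestOrder(Uses);
  if (Best.empty())
    std::printf("identity");
  else
    for (unsigned I : Best)
      std::printf("%u ", I); // prints: 1 0 3 2
  std::printf("\n");
}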