From: Eric Christopher
Date: Fri, 18 Sep 2020 19:33:12 +0000 (-0700)
Subject: Temporarily Revert "[SLP] Allow reordering of vectorization trees with reused instruc...
X-Git-Tag: llvmorg-13-init~11583
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ecfd8161bf43d035eafb75c14e9cf4a6d3966946;p=platform%2Fupstream%2Fllvm.git

Temporarily Revert "[SLP] Allow reordering of vectorization trees with reused
instructions." as it gets stuck in an infinite loop on occasion.

This reverts commit 455ca0ebb69210046928fedffe292420a30f89ad.
---

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index e4cad01..c487301 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -523,15 +523,6 @@ static bool isSimple(Instruction *I) {
 
 namespace llvm {
 
-static void inversePermutation(ArrayRef<unsigned> Indices,
-                               SmallVectorImpl<int> &Mask) {
-  Mask.clear();
-  const unsigned E = Indices.size();
-  Mask.resize(E, E + 1);
-  for (unsigned I = 0; I < E; ++I)
-    Mask[Indices[I]] = I;
-}
-
 namespace slpvectorizer {
 
 /// Bottom Up SLP Vectorizer.
@@ -546,7 +537,6 @@ public:
   using StoreList = SmallVector<StoreInst *, 8>;
   using ExtraValueToDebugLocsMap =
       MapVector<Value *, SmallVector<Instruction *, 2>>;
-  using OrdersType = SmallVector<unsigned, 4>;
 
   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
           TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
@@ -624,14 +614,6 @@ public:
 
   /// \returns The best order of instructions for vectorization.
   Optional<ArrayRef<unsigned>> bestOrder() const {
-    assert(llvm::all_of(
-               NumOpsWantToKeepOrder,
-               [this](const decltype(NumOpsWantToKeepOrder)::value_type &D) {
-                 return D.getFirst().size() ==
-                        VectorizableTree[0]->Scalars.size();
-               }) &&
-           "All orders must have the same size as number of instructions in "
-           "tree node.");
     auto I = std::max_element(
         NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
         [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
@@ -645,79 +627,6 @@ public:
     return makeArrayRef(I->getFirst());
   }
 
-  /// Builds the correct order for root instructions.
-  /// If some leaves have the same instructions to be vectorized, we may
-  /// incorrectly evaluate the best order for the root node (it is built for
-  /// the vector of instructions without repeated instructions and, thus, has
-  /// fewer elements than the root node). This function builds the correct
-  /// order for the root node.
-  /// For example, if the root node is \<a+b, a+c, a+d, f\>, then the leaves
-  /// are \<a, a, a, f\> and \<b, c, d, f\>. When we try to vectorize the first
-  /// leaf, it will be shrunk to \<a, f\>. If instructions in this leaf should
-  /// be reordered, the best order will be \<1, 0\>. We need to extend this
-  /// order for the root node. For the root node this order should look like
-  /// \<3, 0, 1, 2\>. This function extends the order for the reused
-  /// instructions.
-  void findRootOrder(OrdersType &Order) {
-    // If the leaf has the same number of instructions to vectorize as the root
-    // - the order must be set already.
-    unsigned RootSize = VectorizableTree[0]->Scalars.size();
-    if (Order.size() == RootSize)
-      return;
-    SmallVector<unsigned, 4> RealOrder(Order.size());
-    std::swap(Order, RealOrder);
-    SmallVector<int, 4> Mask;
-    inversePermutation(RealOrder, Mask);
-    for (int I = 0, E = Mask.size(); I < E; ++I)
-      Order[I] = Mask[I];
-    // The leaf has fewer instructions than the root - need to find the true
-    // order of the root.
-    // Scan the nodes starting from the leaf back to the root.
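The removed findRootOrder below is the heart of the reverted change. What follows is a minimal standalone sketch of one iteration of its order-extension math (plain C++, not LLVM code; invert and extendOrderForRoot are hypothetical names), checked against the \<3, 0, 1, 2\> example from the doc comment above:

#include <cassert>
#include <cstdio>
#include <vector>

// Inverse of a permutation, mirroring the inversePermutation helper:
// if Indices[NewPos] == OldPos, the result maps OldPos back to NewPos.
static std::vector<unsigned> invert(const std::vector<unsigned> &Indices) {
  std::vector<unsigned> Mask(Indices.size());
  for (unsigned I = 0, E = Indices.size(); I < E; ++I)
    Mask[Indices[I]] = I;
  return Mask;
}

// Extend the best order of a deduplicated leaf to a root whose element I
// reuses leaf element ReuseIndices[I] (one level of the removed loop).
static std::vector<unsigned>
extendOrderForRoot(std::vector<unsigned> Order,
                   const std::vector<unsigned> &ReuseIndices) {
  const unsigned RootSize = ReuseIndices.size();
  Order = invert(Order);
  // Step 1: count how many root elements land in each leaf order slot.
  std::vector<unsigned> OrderCounter(Order.size(), 0);
  for (unsigned I : ReuseIndices)
    ++OrderCounter[Order[I]];
  // Steps 2-3: new index = number of elements ordered before this slot plus
  // reuses of this slot seen so far; the stored value is the root position.
  std::vector<unsigned> NewOrder(RootSize, RootSize);
  std::vector<unsigned> CurrentCounter(Order.size(), 0);
  for (unsigned I = 0; I < RootSize; ++I) {
    const unsigned OrderIdx = Order[ReuseIndices[I]];
    unsigned NewIdx = CurrentCounter[OrderIdx]++;
    for (unsigned J = 0; J < OrderIdx; ++J)
      NewIdx += OrderCounter[J];
    assert(NewOrder[NewIdx] == RootSize && "slot must not be written twice");
    NewOrder[NewIdx] = I;
  }
  return NewOrder;
}

int main() {
  // Root <a+b, a+c, a+d, f> reuses the deduplicated leaf <a, f> as
  // <0, 0, 0, 1>; the leaf's best order <1, 0> extends to <3, 0, 1, 2>.
  const std::vector<unsigned> Ext = extendOrderForRoot({1, 0}, {0, 0, 0, 1});
  for (unsigned V : Ext)
    std::printf("%u ", V); // prints: 3 0 1 2
  std::printf("\n");
  return 0;
}

The inverse-permutation pre-step turns the leaf's best order into a position map; the two counters then place the root's reused elements stably, grouped by the leaf slot they map to.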
-    const TreeEntry *PNode = VectorizableTree.back().get();
-    while (PNode) {
-      const TreeEntry &Node = *PNode;
-      PNode = Node.UserTreeIndices.back().UserTE;
-      if (Node.ReuseShuffleIndices.empty())
-        continue;
-      // Build the order for the parent node.
-      OrdersType NewOrder(Node.ReuseShuffleIndices.size(), RootSize);
-      SmallVector<unsigned, 4> OrderCounter(Order.size(), 0);
-      // The algorithm of the order extension is:
-      // 1. Calculate the number of identical instructions for each order slot.
-      // 2. Calculate the index in the new order: the total number of
-      //    instructions with an order less than the order of the current
-      //    instruction, plus the reuse number of the current instruction.
-      // 3. The value stored in the new order is just the index of the
-      //    instruction in the original vector of instructions.
-      for (unsigned I : Node.ReuseShuffleIndices)
-        ++OrderCounter[Order[I]];
-      SmallVector<unsigned, 4> CurrentCounter(Order.size(), 0);
-      for (unsigned I = 0, E = Node.ReuseShuffleIndices.size(); I < E; ++I) {
-        unsigned ReusedIdx = Node.ReuseShuffleIndices[I];
-        unsigned OrderIdx = Order[ReusedIdx];
-        unsigned NewIdx = 0;
-        for (unsigned J = 0; J < OrderIdx; ++J)
-          NewIdx += OrderCounter[J];
-        NewIdx += CurrentCounter[OrderIdx];
-        ++CurrentCounter[OrderIdx];
-        assert(NewOrder[NewIdx] == RootSize &&
-               "The order index should not be written already.");
-        NewOrder[NewIdx] = I;
-      }
-      std::swap(Order, NewOrder);
-      // If the size of the order is the same as the number of instructions in
-      // the root node, there is no need to extend it further.
-      if (Order.size() == RootSize)
-        break;
-    }
-    assert((!PNode || Order.size() == RootSize) &&
-           "Root node is expected or the size of the order must be the same as "
-           "the number of elements in the root node.");
-    assert(llvm::all_of(Order,
-                        [RootSize](unsigned Val) { return Val != RootSize; }) &&
-           "All indices must be initialized");
-  }
-
   /// \return The vector element size in bits to use when vectorizing the
   /// expression tree ending at \p V. If V is a store, the size is the width of
   /// the stored value. Otherwise, the size is the width of the largest loaded
@@ -1558,7 +1467,7 @@ private:
     SmallVector<int, 4> ReuseShuffleIndices;
 
     /// Does this entry require reordering?
-    SmallVector<unsigned, 4> ReorderIndices;
+    ArrayRef<unsigned> ReorderIndices;
 
     /// Points back to the VectorizableTree.
     ///
@@ -1751,7 +1660,7 @@ private:
       Last->State = Vectorized ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
       Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
                                        ReuseShuffleIndices.end());
-      Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
+      Last->ReorderIndices = ReorderIndices;
       Last->setOperations(S);
       if (Vectorized) {
         for (int i = 0, e = VL.size(); i != e; ++i) {
@@ -2288,6 +2197,7 @@ private:
   /// List of users to ignore during scheduling and that don't need extracting.
   ArrayRef<Value *> UserIgnoreList;
 
+  using OrdersType = SmallVector<unsigned, 4>;
   /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
   /// sorted SmallVectors of unsigned.
   struct OrdersTypeDenseMapInfo {
@@ -2749,10 +2659,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         });
         // Insert new order with initial value 0, if it does not exist,
         // otherwise return the iterator to the existing one.
+        auto StoredCurrentOrderAndNum =
+            NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+        ++StoredCurrentOrderAndNum->getSecond();
         newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                     ReuseShuffleIndicies, CurrentOrder);
-        findRootOrder(CurrentOrder);
-        ++NumOpsWantToKeepOrder[CurrentOrder];
+                     ReuseShuffleIndicies,
+                     StoredCurrentOrderAndNum->getFirst());
         // This is a special case, as it does not gather, but at the same time
         // we are not extending buildTree_rec() towards the operands.
         ValueList Op0;
@@ -2829,13 +2741,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
           LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         } else {
           // Need to reorder.
+          auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+          ++I->getSecond();
           TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                       ReuseShuffleIndicies, CurrentOrder);
+                                       ReuseShuffleIndicies, I->getFirst());
           TE->setOperandsInOrder();
           LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
-          findRootOrder(CurrentOrder);
-          ++NumOpsWantToKeepOrder[CurrentOrder];
         }
         return;
       }
@@ -3091,14 +3003,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
         buildTree_rec(Operands, Depth + 1, {TE, 0});
         LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
       } else {
+        // Need to reorder.
+        auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
+        ++(I->getSecond());
         TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
-                                     ReuseShuffleIndicies, CurrentOrder);
+                                     ReuseShuffleIndicies, I->getFirst());
         TE->setOperandsInOrder();
         buildTree_rec(Operands, Depth + 1, {TE, 0});
         LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
-        findRootOrder(CurrentOrder);
-        ++NumOpsWantToKeepOrder[CurrentOrder];
       }
       return;
     }
@@ -4228,6 +4141,15 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
   return V;
 }
 
+static void inversePermutation(ArrayRef<unsigned> Indices,
+                               SmallVectorImpl<int> &Mask) {
+  Mask.clear();
+  const unsigned E = Indices.size();
+  Mask.resize(E);
+  for (unsigned I = 0; I < E; ++I)
+    Mask[Indices[I]] = I;
+}
+
 Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   IRBuilder<>::InsertPointGuard Guard(Builder);
 
@@ -6951,10 +6873,8 @@ public:
       ArrayRef<Value *> VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
       V.buildTree(VL, ExternallyUsedValues, IgnoreList);
       Optional<ArrayRef<unsigned>> Order = V.bestOrder();
-      if (Order) {
-        assert(Order->size() == VL.size() &&
-               "Order size must be the same as number of vectorized "
-               "instructions.");
+      // TODO: Handle orders of size less than number of elements in the vector.
+      if (Order && Order->size() == VL.size()) {
         // TODO: reorder tree nodes without tree rebuilding.
         SmallVector<Value *, 4> ReorderedOps(VL.size());
         llvm::transform(*Order, ReorderedOps.begin(),
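All three buildTree_rec hunks above restore the same counting idiom, and bestOrder then simply picks the most requested order. Below is a minimal standalone sketch of that idiom (plain C++17; std::map and OrderBook are stand-ins for LLVM's MapVector and BoUpSLP, not the actual API):

#include <algorithm>
#include <map>
#include <optional>
#include <vector>

using OrdersType = std::vector<unsigned>;

struct OrderBook {
  // Maps each requested element order to how many tree nodes want it.
  std::map<OrdersType, unsigned> NumOpsWantToKeepOrder;

  // Record that one vectorizable node wants this order: try_emplace
  // value-initializes the count to 0 on first insertion and otherwise
  // returns the iterator to the existing entry, which is then incremented.
  void record(const OrdersType &Order) {
    auto It = NumOpsWantToKeepOrder.try_emplace(Order).first;
    ++It->second;
  }

  // bestOrder-style selection: the most frequently requested order wins.
  std::optional<OrdersType> bestOrder() const {
    auto It = std::max_element(
        NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
        [](const auto &D1, const auto &D2) { return D1.second < D2.second; });
    if (It == NumOpsWantToKeepOrder.end())
      return std::nullopt;
    return It->first;
  }
};

int main() {
  OrderBook Book;
  Book.record({1, 0});
  Book.record({1, 0});
  Book.record({0, 1});
  return Book.bestOrder().value() == OrdersType{1, 0} ? 0 : 1;
}

Returning the key stored in the map (rather than the local CurrentOrder) is what lets the restored code hand newTreeEntry a stable ArrayRef into the map's storage.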
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
index a84b1f7..8b12b92 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll
@@ -11,7 +11,7 @@
 @h = common dso_local global float 0.000000e+00, align 4
 
 define dso_local void @j() local_unnamed_addr {
-; CHECK-LABEL: @j(
+; CHECK-LABEL: define {{[^@]+}}@j(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32*, i32** @b, align 8
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4
@@ -19,39 +19,42 @@ define dso_local void @j() local_unnamed_addr {
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4
+; CHECK-NEXT:    [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 13
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX1]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]]
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP5:%.*]] = add nsw <2 x i32> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x float>
 ; CHECK-NEXT:    [[TMP7:%.*]] = fmul <2 x float> [[TMP6]],
-; CHECK-NEXT:    [[TMP8:%.*]] = fsub <2 x float> , [[TMP7]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> undef, <4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = fsub <2 x float> , [[TMP7]]
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1
 ; CHECK-NEXT:    store float [[TMP9]], float* @g, align 4
-; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]],
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]],
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 2
 ; CHECK-NEXT:    store float [[TMP11]], float* @c, align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 2
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 0
 ; CHECK-NEXT:    store float [[TMP12]], float* @d, align 4
-; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 3
 ; CHECK-NEXT:    store float [[TMP13]], float* @e, align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 1
 ; CHECK-NEXT:    store float [[TMP14]], float* @f, align 4
 ; CHECK-NEXT:    [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 14
 ; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 15
 ; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* @a, align 4
 ; CHECK-NEXT:    [[CONV19:%.*]] = sitofp i32 [[TMP15]] to float
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 2
-; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2
-; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 3
-; CHECK-NEXT:    [[TMP19:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]]
-; CHECK-NEXT:    [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32>
-; CHECK-NEXT:    [[TMP22:%.*]] = fptosi <4 x float> [[TMP21]] to <4 x i32>
-; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> undef, <4 x i32>
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[REORDER_SHUFFLE]], <4 x i32>* [[TMP23]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x float> undef, float [[CONV19]], i32 0
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float -1.000000e+00, i32 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP18]], i32 2
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float -1.000000e+00, i32 3
+; CHECK-NEXT:    [[TMP21:%.*]] = fsub <4 x float> [[TMP10]], [[TMP20]]
+; CHECK-NEXT:    [[TMP22:%.*]] = fadd <4 x float> [[TMP10]], [[TMP20]]
+; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <4 x float> [[TMP21]], <4 x float> [[TMP22]], <4 x i32>
+; CHECK-NEXT:    [[TMP24:%.*]] = fptosi <4 x float> [[TMP23]] to <4 x i32>
+; CHECK-NEXT:    [[TMP25:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP24]], <4 x i32>* [[TMP25]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
index 9ed21a1..384e540 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll
@@ -14,10 +14,11 @@ define void @hoge() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i16> undef, i16 [[T]], i32 0
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 undef, i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i32> , [[TMP2]]
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <2 x i32> , [[REORDER_SHUFFLE]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef
-; CHECK-NEXT:    [[SHUFFLE5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE5]],
+; CHECK-NEXT:    [[SHUFFLE5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE5]],
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]])
 ; CHECK-NEXT:    [[T19:%.*]] = select i1 undef, i32 [[TMP6]], i32 undef
 ; CHECK-NEXT:    [[T20:%.*]] = icmp sgt i32 [[T19]], 63
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
index 02e7c5b..b7cff2d 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll
@@ -7,15 +7,16 @@ define i32 @foo(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a4
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A7:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A8:%.*]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A1:%.*]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A2:%.*]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A3:%.*]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A4:%.*]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A5:%.*]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A6:%.*]], i32 7
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[REORDER_SHUFFLE]], <2 x i32> undef, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
@@ -57,15 +58,16 @@ define i32 @foo1(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a
 ; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A6:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A4:%.*]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A5:%.*]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A8:%.*]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A2:%.*]], i32 5
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
 ; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
@@ -111,15 +113,16 @@ define i32 @foo2(i32* nocapture readonly %arr, i32 %a1, i32 %a2, i32 %a3, i32 %a
 ; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A4:%.*]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A6:%.*]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A5:%.*]], i32 2
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A8:%.*]], i32 3
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A2:%.*]], i32 4
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A7:%.*]], i32 5
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A1:%.*]], i32 6
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7
+; CHECK-NEXT:    [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7
 ; CHECK-NEXT:    [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = call i32 @llvm.experimental.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]])
 ; CHECK-NEXT:    ret i32 [[TMP11]]
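The FileCheck churn above is mostly the reappearance of %reorder_shuffle instructions: when loads arrive jumbled, the restored code emits a shufflevector whose mask is the inverse permutation of the jumbled order. A standalone sketch of those lane semantics follows (plain C++, not LLVM code; the two-lane <i32 1, i32 0> masks above are assumed, being the only non-identity permutation of two lanes):

#include <cassert>
#include <vector>

// Result lane I takes input lane Mask[I] - the semantics of a single-input
// shufflevector like the %reorder_shuffle lines above.
template <typename T>
static std::vector<T> shuffleVector(const std::vector<T> &V,
                                    const std::vector<int> &Mask) {
  std::vector<T> Result;
  Result.reserve(Mask.size());
  for (int M : Mask)
    Result.push_back(V[M]);
  return Result;
}

int main() {
  // Two lanes loaded in memory order but consumed swapped: the inverse of
  // the jumbled order <1, 0> is again <1, 0>, so one shuffle restores the
  // order the scalar code expects.
  const std::vector<int> Loaded = {40, 50};
  const std::vector<int> Reordered = shuffleVector(Loaded, {1, 0});
  assert(Reordered[0] == 50 && Reordered[1] == 40);
  return 0;
}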