From 150ea765431280b931e0673448863ec98f1d0ac3 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 16 Mar 2022 13:53:36 -0700 Subject: [PATCH] Revert "[SLP]Do not schedule instructions with constants/argument/phi operands and external users." This reverts commit 1eeb2bfe727323332800e8d390f2f8c63c953779 to fix a bug reported in https://reviews.llvm.org/D121121 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 175 ++++----------------- .../SLPVectorizer/AArch64/gather-reduce.ll | 8 +- .../SLPVectorizer/AArch64/gather-root.ll | 67 ++++++-- .../SLPVectorizer/AArch64/spillcost-di.ll | 8 +- .../SLPVectorizer/AArch64/trunc-insertion.ll | 4 +- .../test/Transforms/SLPVectorizer/X86/PR35628_2.ll | 2 +- llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll | 8 +- .../Transforms/SLPVectorizer/X86/barriercall.ll | 2 +- .../SLPVectorizer/X86/consecutive-access.ll | 2 +- .../Transforms/SLPVectorizer/X86/crash_cmpop.ll | 2 +- .../SLPVectorizer/X86/crash_exceed_scheduling.ll | 22 +-- .../SLPVectorizer/X86/cross_block_slp.ll | 2 +- .../test/Transforms/SLPVectorizer/X86/cycle_dup.ll | 2 +- .../Transforms/SLPVectorizer/X86/external_user.ll | 2 +- .../Transforms/SLPVectorizer/X86/geps-non-pow-2.ll | 14 +- .../Transforms/SLPVectorizer/X86/multi_block.ll | 8 +- .../Transforms/SLPVectorizer/X86/opaque-ptr.ll | 4 +- llvm/test/Transforms/SLPVectorizer/X86/phi.ll | 22 +-- .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 92 +++++------ llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll | 92 +++++------ llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll | 8 +- llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll | 20 +-- .../SLPVectorizer/X86/shrink_after_reorder2.ll | 8 +- .../SLPVectorizer/X86/sitofp-inseltpoison.ll | 12 +- llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll | 12 +- .../SLPVectorizer/X86/stores-non-ordered.ll | 20 +-- .../SLPVectorizer/X86/vectorize-widest-phis.ll | 2 +- .../Transforms/SLPVectorizer/slp-max-phi-size.ll | 80 +++++----- 28 files changed, 311 insertions(+), 389 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 80b6ff7..90114f9 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -776,57 +776,6 @@ static void reorderScalars(SmallVectorImpl &Scalars, Scalars[Mask[I]] = Prev[I]; } -/// Checks if the provided value does not require scheduling. It does not -/// require scheduling if this is not an instruction or it is an instruction -/// that does not read/write memory and all operands are either not instructions -/// or phi nodes or instructions from different blocks. -static bool areAllOperandsNonInsts(Value *V) { - auto *I = dyn_cast(V); - if (!I) - return true; - return !I->mayReadOrWriteMemory() && all_of(I->operands(), [I](Value *V) { - auto *IO = dyn_cast(V); - if (!IO) - return true; - return isa(IO) || IO->getParent() != I->getParent(); - }); -} - -/// Checks if the provided value does not require scheduling. It does not -/// require scheduling if this is not an instruction or it is an instruction -/// that does not read/write memory and all users are phi nodes or instructions -/// from the different blocks. -static bool isUsedOutsideBlock(Value *V) { - auto *I = dyn_cast(V); - if (!I) - return true; - // Limits the number of uses to save compile time. - constexpr int UsesLimit = 8; - return !I->mayReadOrWriteMemory() && !I->hasNUsesOrMore(UsesLimit) && - all_of(I->users(), [I](User *U) { - auto *IU = dyn_cast(U); - if (!IU) - return true; - return IU->getParent() != I->getParent() || isa(IU); - }); -} - -/// Checks if the specified value does not require scheduling. It does not -/// require scheduling if all operands and all users do not need to be scheduled -/// in the current basic block. -static bool doesNotNeedToBeScheduled(Value *V) { - return areAllOperandsNonInsts(V) && isUsedOutsideBlock(V); -} - -/// Checks if the specified array of instructions does not require scheduling. -/// It is so if all either instructions have operands that do not require -/// scheduling or their users do not require scheduling since they are phis or -/// in other basic blocks. -static bool doesNotNeedToSchedule(ArrayRef VL) { - return !VL.empty() && - (all_of(VL, isUsedOutsideBlock) || all_of(VL, areAllOperandsNonInsts)); -} - namespace slpvectorizer { /// Bottom Up SLP Vectorizer. @@ -2410,21 +2359,15 @@ private: ScalarToTreeEntry[V] = Last; } // Update the scheduler bundle to point to this TreeEntry. - ScheduleData *BundleMember = Bundle.getValue(); - assert((BundleMember || isa(S.MainOp) || - isVectorLikeInstWithConstOps(S.MainOp) || - doesNotNeedToSchedule(VL)) && - "Bundle and VL out of sync"); - if (BundleMember) { - for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) - continue; - assert(BundleMember && "Unexpected end of bundle."); - BundleMember->TE = Last; - BundleMember = BundleMember->NextInBundle; - } + unsigned Lane = 0; + for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; + BundleMember = BundleMember->NextInBundle) { + BundleMember->TE = Last; + BundleMember->Lane = Lane; + ++Lane; } - assert(!BundleMember && "Bundle and VL out of sync"); + assert((!Bundle.getValue() || Lane == VL.size()) && + "Bundle and VL out of sync"); } else { MustGather.insert(VL.begin(), VL.end()); } @@ -2561,6 +2504,7 @@ private: clearDependencies(); OpValue = OpVal; TE = nullptr; + Lane = -1; } /// Verify basic self consistency properties @@ -2600,7 +2544,7 @@ private: /// Returns true if it represents an instruction bundle and not only a /// single instruction. bool isPartOfBundle() const { - return NextInBundle != nullptr || FirstInBundle != this || TE; + return NextInBundle != nullptr || FirstInBundle != this; } /// Returns true if it is ready for scheduling, i.e. it has no more @@ -2705,6 +2649,9 @@ private: /// Note that this is negative as long as Dependencies is not calculated. int UnscheduledDeps = InvalidDeps; + /// The lane of this node in the TreeEntry. + int Lane = -1; + /// True if this instruction is scheduled (or considered as scheduled in the /// dry-run). bool IsScheduled = false; @@ -2722,21 +2669,6 @@ private: friend struct DOTGraphTraits; /// Contains all scheduling data for a basic block. - /// It does not schedules instructions, which are not memory read/write - /// instructions and their operands are either constants, or arguments, or - /// phis, or instructions from others blocks, or their users are phis or from - /// the other blocks. The resulting vector instructions can be placed at the - /// beginning of the basic block without scheduling (if operands does not need - /// to be scheduled) or at the end of the block (if users are outside of the - /// block). It allows to save some compile time and memory used by the - /// compiler. - /// ScheduleData is assigned for each instruction in between the boundaries of - /// the tree entry, even for those, which are not part of the graph. It is - /// required to correctly follow the dependencies between the instructions and - /// their correct scheduling. The ScheduleData is not allocated for the - /// instructions, which do not require scheduling, like phis, nodes with - /// extractelements/insertelements only or nodes with instructions, with - /// uses/operands outside of the block. struct BlockScheduling { BlockScheduling(BasicBlock *BB) : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} @@ -2764,7 +2696,7 @@ private: if (BB != I->getParent()) // Avoid lookup if can't possibly be in map. return nullptr; - ScheduleData *SD = ScheduleDataMap.lookup(I); + ScheduleData *SD = ScheduleDataMap[I]; if (SD && isInSchedulingRegion(SD)) return SD; return nullptr; @@ -2781,7 +2713,7 @@ private: return getScheduleData(V); auto I = ExtraScheduleDataMap.find(V); if (I != ExtraScheduleDataMap.end()) { - ScheduleData *SD = I->second.lookup(Key); + ScheduleData *SD = I->second[Key]; if (SD && isInSchedulingRegion(SD)) return SD; } @@ -2803,7 +2735,7 @@ private: BundleMember = BundleMember->NextInBundle) { if (BundleMember->Inst != BundleMember->OpValue) continue; - + // Handle the def-use chain dependencies. // Decrement the unscheduled counter and insert to ready list if ready. @@ -2828,9 +2760,7 @@ private: // reordered during buildTree(). We therefore need to get its operands // through the TreeEntry. if (TreeEntry *TE = BundleMember->TE) { - // Need to search for the lane since the tree entry can be reordered. - int Lane = std::distance(TE->Scalars.begin(), - find(TE->Scalars, BundleMember->Inst)); + int Lane = BundleMember->Lane; assert(Lane >= 0 && "Lane not set"); // Since vectorization tree is being built recursively this assertion @@ -2839,7 +2769,7 @@ private: // where their second (immediate) operand is not added. Since // immediates do not affect scheduler behavior this is considered // okay. - auto *In = BundleMember->Inst; + auto *In = TE->getMainOp(); assert(In && (isa(In) || isa(In) || In->getNumOperands() == TE->getNumOperands()) && @@ -2884,8 +2814,7 @@ private: for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { auto *SD = getScheduleData(I); - if (!SD) - continue; + assert(SD && "primary scheduledata must exist in window"); assert(isInSchedulingRegion(SD) && "primary schedule data not in window?"); assert(isInSchedulingRegion(SD->FirstInBundle) && @@ -3927,22 +3856,6 @@ static LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, return LoadsState::Gather; } -/// \return true if the specified list of values has only one instruction that -/// requires scheduling, false otherwise. -static bool needToScheduleSingleInstruction(ArrayRef VL) { - Value *NeedsScheduling = nullptr; - for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) - continue; - if (!NeedsScheduling) { - NeedsScheduling = V; - continue; - } - return false; - } - return NeedsScheduling; -} - void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -6485,20 +6398,6 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { return !E->isOpcodeOrAlt(I) || I->getParent() == BB; })); - // Set the insert point to the beginning of the basic block if the entry - // should not be scheduled. - if (E->State != TreeEntry::NeedToGather && - doesNotNeedToSchedule(E->Scalars)) { - BasicBlock::iterator InsertPt; - if (all_of(E->Scalars, isUsedOutsideBlock)) - InsertPt = BB->getTerminator()->getIterator(); - else - InsertPt = BB->getFirstInsertionPt(); - Builder.SetInsertPoint(BB, InsertPt); - Builder.SetCurrentDebugLocation(Front->getDebugLoc()); - return; - } - // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -6507,10 +6406,8 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) { // VL.back() and iterate over schedule data until we reach the end of the // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { - Value *V = E->isOneOf(E->Scalars.back()); - if (doesNotNeedToBeScheduled(V)) - V = *find_if_not(E->Scalars, doesNotNeedToBeScheduled); - auto *Bundle = BlocksSchedules[BB]->getScheduleData(V); + auto *Bundle = + BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -7736,11 +7633,9 @@ void BoUpSLP::optimizeGatherSequence() { BoUpSLP::ScheduleData * BoUpSLP::BlockScheduling::buildBundle(ArrayRef VL) { - ScheduleData *Bundle = nullptr; + ScheduleData *Bundle = nullptr; ScheduleData *PrevInBundle = nullptr; for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) - continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member " @@ -7768,8 +7663,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, const InstructionsState &S) { // No need to schedule PHIs, insertelement, extractelement and extractvalue // instructions. - if (isa(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue) || - doesNotNeedToSchedule(VL)) + if (isa(S.OpValue) || isVectorLikeInstWithConstOps(S.OpValue)) return nullptr; // Initialize the instruction bundle. @@ -7815,8 +7709,6 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) - continue; if (!extendSchedulingRegion(V, S)) { // If the scheduling region got new instructions at the lower end (or it // is a new region for the first bundle). This makes it necessary to @@ -7831,8 +7723,6 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, bool ReSchedule = false; for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) - continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); @@ -7862,18 +7752,14 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, Value *OpValue) { - if (isa(OpValue) || isVectorLikeInstWithConstOps(OpValue) || - doesNotNeedToSchedule(VL)) + if (isa(OpValue) || isVectorLikeInstWithConstOps(OpValue)) return; - if (doesNotNeedToBeScheduled(OpValue)) - OpValue = *find_if_not(VL, doesNotNeedToBeScheduled); ScheduleData *Bundle = getScheduleData(OpValue); LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); - assert(Bundle->isSchedulingEntity() && - (Bundle->isPartOfBundle() || needToScheduleSingleInstruction(VL)) && + assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && "tried to unbundle something which is not a bundle"); // Remove the bundle from the ready list. @@ -7887,7 +7773,6 @@ void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; - BundleMember->TE = nullptr; if (BundleMember->unscheduledDepsInBundle() == 0) { ReadyInsts.insert(BundleMember); } @@ -7911,7 +7796,6 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, Instruction *I = dyn_cast(V); assert(I && "bundle member must be an instruction"); assert(!isa(I) && !isVectorLikeInstWithConstOps(I) && - !doesNotNeedToBeScheduled(I) && "phi nodes/insertelements/extractelements/extractvalues don't need to " "be scheduled"); auto &&CheckScheduleForI = [this, &S](Instruction *I) -> bool { @@ -7988,10 +7872,7 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, ScheduleData *NextLoadStore) { ScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { - // No need to allocate data for non-schedulable instructions. - if (doesNotNeedToBeScheduled(I)) - continue; - ScheduleData *SD = ScheduleDataMap.lookup(I); + ScheduleData *SD = ScheduleDataMap[I]; if (!SD) { SD = allocateScheduleDataChunks(); ScheduleDataMap[I] = SD; @@ -8175,10 +8056,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { - TreeEntry *SDTE = getTreeEntry(SD->Inst); assert((isVectorLikeInstWithConstOps(SD->Inst) || - SD->isPartOfBundle() == - (SDTE && !doesNotNeedToSchedule(SDTE->Scalars))) && + SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll index ec7b03a..536f72a 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll @@ -36,7 +36,6 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re ; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] -; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -86,6 +85,7 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re ; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] +; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] @@ -111,7 +111,6 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re ; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] -; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -161,6 +160,7 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re ; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] +; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] @@ -297,7 +297,6 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re ; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] -; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -347,6 +346,7 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re ; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] +; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] @@ -372,7 +372,6 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re ; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] ; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ] -; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>* ; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 ; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32> @@ -422,6 +421,7 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re ; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2 ; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32 ; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]] +; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8 ; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7 ; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64 ; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll index 01d743f..e9c502b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -35,14 +35,41 @@ define void @PR28330(i32 %n) { ; ; MAX-COST-LABEL: @PR28330( ; MAX-COST-NEXT: entry: -; MAX-COST-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1 -; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer +; MAX-COST-NEXT: [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 +; MAX-COST-NEXT: [[P1:%.*]] = icmp eq i8 [[P0]], 0 +; MAX-COST-NEXT: [[P2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2 +; MAX-COST-NEXT: [[P3:%.*]] = icmp eq i8 [[P2]], 0 +; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 +; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 +; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 +; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 +; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 +; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 +; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 +; MAX-COST-NEXT: [[P11:%.*]] = icmp eq i8 [[P10]], 0 +; MAX-COST-NEXT: [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1 +; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0 +; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 +; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0 ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: -; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> -; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) -; MAX-COST-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], [[P17]] +; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; MAX-COST-NEXT: [[P19:%.*]] = select i1 [[P1]], i32 -720, i32 -80 +; MAX-COST-NEXT: [[P20:%.*]] = add i32 [[P17]], [[P19]] +; MAX-COST-NEXT: [[P21:%.*]] = select i1 [[P3]], i32 -720, i32 -80 +; MAX-COST-NEXT: [[P22:%.*]] = add i32 [[P20]], [[P21]] +; MAX-COST-NEXT: [[P23:%.*]] = select i1 [[P5]], i32 -720, i32 -80 +; MAX-COST-NEXT: [[P24:%.*]] = add i32 [[P22]], [[P23]] +; MAX-COST-NEXT: [[P25:%.*]] = select i1 [[P7]], i32 -720, i32 -80 +; MAX-COST-NEXT: [[P26:%.*]] = add i32 [[P24]], [[P25]] +; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 +; MAX-COST-NEXT: [[P28:%.*]] = add i32 [[P26]], [[P27]] +; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 +; MAX-COST-NEXT: [[P30:%.*]] = add i32 [[P28]], [[P29]] +; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 +; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[P30]], [[P31]] +; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 +; MAX-COST-NEXT: [[P34]] = add i32 [[P32]], [[P33]] ; MAX-COST-NEXT: br label [[FOR_BODY]] ; entry: @@ -112,14 +139,30 @@ define void @PR32038(i32 %n) { ; ; MAX-COST-LABEL: @PR32038( ; MAX-COST-NEXT: entry: -; MAX-COST-NEXT: [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1 -; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer +; MAX-COST-NEXT: [[TMP0:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <4 x i8>*), align 1 +; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <4 x i8> [[TMP0]], zeroinitializer +; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 +; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 +; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 +; MAX-COST-NEXT: [[P11:%.*]] = icmp eq i8 [[P10]], 0 +; MAX-COST-NEXT: [[P12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1 +; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0 +; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 +; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0 ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: -; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[OP_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> , <8 x i32> -; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) -; MAX-COST-NEXT: [[OP_EXTRA]] = add i32 [[TMP3]], -5 +; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; MAX-COST-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 +; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 +; MAX-COST-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP2]]) +; MAX-COST-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[P27]] +; MAX-COST-NEXT: [[TMP5:%.*]] = add i32 [[TMP4]], [[P29]] +; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP5]], -5 +; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 +; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] +; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 +; MAX-COST-NEXT: [[P34]] = add i32 [[P32]], [[P33]] ; MAX-COST-NEXT: br label [[FOR_BODY]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll index c145109..39f2f88 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll @@ -14,14 +14,14 @@ define void @patatino(i64 %n, i64 %i, %struct.S* %p) !dbg !7 { ; CHECK-NEXT: call void @llvm.dbg.value(metadata %struct.S* [[P:%.*]], metadata [[META20:![0-9]+]], metadata !DIExpression()), !dbg [[DBG25:![0-9]+]] ; CHECK-NEXT: [[X1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[P]], i64 [[N]], i32 0, !dbg [[DBG26:![0-9]+]] ; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 undef, metadata [[META21:![0-9]+]], metadata !DIExpression()), !dbg [[DBG27:![0-9]+]] -; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 undef, metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG28:![0-9]+]] -; CHECK-NEXT: [[Y3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[N]], i32 1, !dbg [[DBG29:![0-9]+]] +; CHECK-NEXT: [[Y3:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[N]], i32 1, !dbg [[DBG28:![0-9]+]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[X1]] to <2 x i64>*, !dbg [[DBG26]] -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8, !dbg [[DBG26]], !tbaa [[TBAA30:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8, !dbg [[DBG26]], !tbaa [[TBAA29:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.value(metadata i64 undef, metadata [[META22:![0-9]+]], metadata !DIExpression()), !dbg [[DBG33:![0-9]+]] ; CHECK-NEXT: [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 0, !dbg [[DBG34:![0-9]+]] ; CHECK-NEXT: [[Y7:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[P]], i64 [[I]], i32 1, !dbg [[DBG35:![0-9]+]] ; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64* [[X5]] to <2 x i64>*, !dbg [[DBG36:![0-9]+]] -; CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8, !dbg [[DBG36]], !tbaa [[TBAA30]] +; CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[TMP2]], align 8, !dbg [[DBG36]], !tbaa [[TBAA29]] ; CHECK-NEXT: ret void, !dbg [[DBG37:![0-9]+]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll index fbd8b88..f6ab38b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll @@ -11,14 +11,14 @@ define dso_local void @l() local_unnamed_addr { ; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ undef, [[BB:%.*]] ], [ [[TMP11:%.*]], [[BB25:%.*]] ] ; CHECK-NEXT: br i1 undef, label [[BB3:%.*]], label [[BB11:%.*]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i16> [[TMP0]], undef ; CHECK-NEXT: [[I4:%.*]] = zext i1 undef to i32 +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i16> [[TMP0]], undef ; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt <2 x i16> [[TMP1]], ; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i1> [[TMP2]] to <2 x i32> ; CHECK-NEXT: br label [[BB25]] ; CHECK: bb11: -; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i16> [[TMP0]], undef ; CHECK-NEXT: [[I12:%.*]] = zext i1 undef to i32 +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i16> [[TMP0]], undef ; CHECK-NEXT: [[TMP5:%.*]] = sext <2 x i16> [[TMP4]] to <2 x i64> ; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <2 x i64> undef, [[TMP5]] ; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i1> [[TMP6]] to <2 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll index d15494e..7f51dca 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR35628_2.ll @@ -9,11 +9,11 @@ define void @test() #0 { ; CHECK: loop: ; CHECK-NEXT: [[DUMMY_PHI:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[OP_EXTRA1:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[TMP3:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> poison, i64 [[TMP0]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> [[SHUFFLE]], ; CHECK-NEXT: [[TMP3]] = extractelement <4 x i64> [[TMP2]], i32 3 -; CHECK-NEXT: [[DUMMY_ADD:%.*]] = add i16 0, 0 ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i64> [[TMP2]], i32 0 ; CHECK-NEXT: [[DUMMY_SHL:%.*]] = shl i64 [[TMP4]], 32 ; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> , [[TMP2]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll index f878bda..7ab610f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -10,10 +10,10 @@ define void @mainTest(i32 %param, i32 * %vals, i32 %len) { ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 -; CHECK-NEXT: store atomic i32 [[TMP4]], i32* [[VALS:%.*]] unordered, align 4 -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 +; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add <16 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP4]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[V44]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll index f323c94..c0f453f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/barriercall.ll @@ -7,12 +7,12 @@ target triple = "x86_64-apple-macosx10.8.0" define i32 @foo(i32* nocapture %A, i32 %n) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @bar() ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> poison, i32 [[N:%.*]], i32 0 ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> -; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @bar() ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP3]], ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* ; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll index 9359042..8f57fe6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/consecutive-access.ll @@ -511,9 +511,9 @@ define double @bar(double* nocapture readonly %a, i32 %n) local_unnamed_addr #0 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[IDXPROM2]] ; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x double>, <2 x double>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP3]], [[TMP5]] ; CHECK-NEXT: [[ADD5]] = add i32 [[I_018]], 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[ADD5]], [[N]] -; CHECK-NEXT: [[TMP6]] = fadd <2 x double> [[TMP3]], [[TMP5]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll index f6219f6..7b914b4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -80,8 +80,8 @@ define void @testfunc(float* nocapture %dest, float* nocapture readonly %src) { ; AVX-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], ; AVX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> ; AVX-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], -; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 ; AVX-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] +; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 ; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AVX: for.end: ; AVX-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll index 0fd6217..de371d8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_exceed_scheduling.ll @@ -8,17 +8,16 @@ define void @exceed(double %0, double %1) { ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[TMP1:%.*]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i32 1 -; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP8]], undef +; CHECK-NEXT: [[TMP6:%.*]] = fdiv fast <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x double> [[TMP6]], i32 1 +; CHECK-NEXT: [[IX:%.*]] = fmul double [[TMP7]], undef ; CHECK-NEXT: [[IXX0:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX1:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX2:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX3:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX4:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX5:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP8]], undef +; CHECK-NEXT: [[IX1:%.*]] = fmul double [[TMP7]], undef ; CHECK-NEXT: [[IXX10:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX11:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX12:%.*]] = fsub double undef, undef @@ -28,14 +27,15 @@ define void @exceed(double %0, double %1) { ; CHECK-NEXT: [[IXX20:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX21:%.*]] = fsub double undef, undef ; CHECK-NEXT: [[IXX22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 -; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP9]], [[TMP9]] -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP7]], [[TMP10]] +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP6]], i32 0 +; CHECK-NEXT: [[IX2:%.*]] = fmul double [[TMP8]], [[TMP8]] +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP11]] ; CHECK-NEXT: [[IXX101:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <2 x double> [[TMP11]], [[TMP6]] ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP13]], double [[TMP7]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <2 x double> [[TMP14]], undef ; CHECK-NEXT: switch i32 undef, label [[BB1:%.*]] [ ; CHECK-NEXT: i32 0, label [[BB2:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll b/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll index 6c0116b..407cfbc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cross_block_slp.ll @@ -22,8 +22,8 @@ define i32 @foo(double* nocapture %A, float* nocapture %B, i32 %g) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[B:%.*]] to <2 x float>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* [[TMP0]], align 4 -; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[G:%.*]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> [[TMP1]], +; CHECK-NEXT: [[TOBOOL:%.*]] = icmp eq i32 [[G:%.*]], 0 ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] ; CHECK: if.then: ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 (...) @bar() diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll b/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll index 066f0ff..36aec7f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cycle_dup.ll @@ -24,9 +24,9 @@ define i32 @foo(i32* nocapture %A) #0 { ; CHECK: for.body: ; CHECK-NEXT: [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i32> [ [[TMP4:%.*]], [[FOR_BODY]] ], [ [[TMP1]], [[ENTRY]] ] +; CHECK-NEXT: [[TMP4]] = mul nsw <4 x i32> [[TMP3]], ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_029]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[TMP2]] -; CHECK-NEXT: [[TMP4]] = mul nsw <4 x i32> [[TMP3]], ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x i32> [ [[TMP1]], [[ENTRY]] ], [ [[TMP4]], [[FOR_BODY]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll index df064c5..012ba4d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/external_user.ll @@ -34,9 +34,9 @@ define double @ext_user(double* noalias nocapture %B, double* noalias nocapture ; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], +; CHECK-NEXT: [[TMP5]] = fadd <2 x double> [[TMP4]], ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_020]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100 -; CHECK-NEXT: [[TMP5]] = fadd <2 x double> [[TMP4]], ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll index f5ccb59..5965438 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -25,22 +25,22 @@ define dso_local i32 @g() local_unnamed_addr { ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint i32* [[TMP5]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 1 -; CHECK-NEXT: store i32 [[TMP7]], i32* [[TMP8]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 1 +; CHECK-NEXT: store i32 [[TMP7]], i32* [[TMP9]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: sw.bb6: ; CHECK-NEXT: [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2 ; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint i32* [[INCDEC_PTR]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 -; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP12]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 +; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: while.body.backedge: ; CHECK-NEXT: [[C_022_BE]] = phi i32* [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] -; CHECK-NEXT: [[TMP14]] = phi <2 x i32*> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP13]], [[SW_BB6]] ], [ [[TMP9]], [[SW_BB]] ] +; CHECK-NEXT: [[TMP14]] = phi <2 x i32*> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ] ; CHECK-NEXT: br label [[WHILE_BODY]] ; CHECK: while.end: ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll index 2d05316..1b224cb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi_block.ll @@ -21,14 +21,14 @@ define i32 @bar(double* nocapture %A, i32 %d) { ; CHECK-LABEL: @bar( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[A:%.*]] to <2 x double>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i32 [[D:%.*]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = fptrunc <2 x double> [[TMP2]] to <2 x float> -; CHECK-NEXT: br i1 [[TMP3]], label [[TMP7:%.*]], label [[TMP5:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = fptrunc <2 x double> [[TMP2]] to <2 x float> +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[D:%.*]], 0 +; CHECK-NEXT: br i1 [[TMP4]], label [[TMP7:%.*]], label [[TMP5:%.*]] ; CHECK: 5: ; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 (...) @foo() ; CHECK-NEXT: br label [[TMP7]] ; CHECK: 7: -; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP4]], +; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP3]], ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, double* [[A]], i64 8 ; CHECK-NEXT: [[TMP10:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double> ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[TMP10]], diff --git a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll index 8dc4a89..80cb197 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/opaque-ptr.ll @@ -58,10 +58,10 @@ define void @test(ptr %r, ptr %p, ptr %q) #0 { define void @test2(i64* %a, i64* %b) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A:%.*]], i32 0 +; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> poison, ptr [[A]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[B:%.*]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, <2 x ptr> [[TMP2]], <2 x i64> -; CHECK-NEXT: [[A2:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 2 ; CHECK-NEXT: [[TMP4:%.*]] = ptrtoint <2 x ptr> [[TMP3]] to <2 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x ptr> [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll index e3e4fdf..df76725 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -83,9 +83,9 @@ define i32 @foo2(double* noalias nocapture %B, double* noalias nocapture %A, i32 ; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP1]], [[ENTRY]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP3]], +; CHECK-NEXT: [[TMP5]] = fadd <2 x double> [[TMP4]], ; CHECK-NEXT: [[INC]] = add nsw i32 [[I_019]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 100 -; CHECK-NEXT: [[TMP5]] = fadd <2 x double> [[TMP4]], ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[B:%.*]] to <2 x double>* @@ -151,7 +151,7 @@ define float @foo3(float* nocapture readonly %A) #0 { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP20:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[TMP2]], [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x float> [ [[TMP5]], [[ENTRY]] ], [ [[TMP12:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x float> [[TMP7]], i32 0 ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP8]], 7.000000e+00 @@ -169,18 +169,18 @@ define float @foo3(float* nocapture readonly %A) #0 { ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = fmul <4 x float> [[TMP17]], -; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121 -; CHECK-NEXT: [[TMP20]] = fadd <4 x float> [[TMP6]], [[TMP18]] +; CHECK-NEXT: [[TMP19]] = fadd <4 x float> [[TMP6]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP20]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP20]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP19]], i32 0 ; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP20]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP19]], i32 1 ; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP20]], i32 2 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP19]], i32 2 ; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP23]] -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[TMP20]], i32 3 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x float> [[TMP19]], i32 3 ; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP24]] ; CHECK-NEXT: ret float [[ADD31]] ; @@ -245,8 +245,6 @@ define float @sort_phi_type(float* nocapture readonly %A) { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x float> [ , [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x float> [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> poison, float [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP0]], i32 1 @@ -256,6 +254,8 @@ define float @sort_phi_type(float* nocapture readonly %A) { ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP0]], i32 2 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP6]], float [[TMP7]], i32 3 ; CHECK-NEXT: [[TMP9]] = fmul <4 x float> [[TMP8]], +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nsw i64 [[INDVARS_IV]], 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 128 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index 3ac598f..f6dd752 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -299,17 +299,17 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 1 -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512VL-NEXT: store i32 [[TMP6]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP4]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 +; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], ; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* ; AVX512VL-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 ; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] @@ -510,10 +510,10 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( +; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 ; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer ; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 ; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 @@ -749,47 +749,47 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <2 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <2 x float*> [[TMP3]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <2 x float*> [[TMP4]], <2 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0 -; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> -; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 -; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> -; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <8 x i32> -; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> [[TMP11]], <8 x i32> -; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <8 x i32> -; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> -; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7 -; AVX512F-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP9]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]] +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> +; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 +; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> +; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 +; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> +; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7 +; AVX512F-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> +; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]] ; AVX512F-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* ; AVX512F-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <2 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <2 x float*> [[TMP3]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <2 x float*> [[TMP4]], <2 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> -; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> [[TMP11]], <8 x i32> -; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> -; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7 -; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP9]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> +; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> +; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7 +; AVX512VL-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> +; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]] ; AVX512VL-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* ; AVX512VL-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index 19fe257..fd1c612 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -299,17 +299,17 @@ define void @gather_load_3(i32* noalias nocapture %0, i32* noalias nocapture rea ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_3( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP3]], <4 x i32*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP1]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 1 -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 -; AVX512VL-NEXT: store i32 [[TMP6]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP4]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 +; AVX512VL-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> poison, i32* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> +; AVX512VL-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP7]], i32 4, <4 x i1> , <4 x i32> undef), !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], ; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>* +; AVX512VL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* ; AVX512VL-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP11]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 ; AVX512VL-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4, !tbaa [[TBAA0]] @@ -510,10 +510,10 @@ define void @gather_load_4(i32* noalias nocapture %t0, i32* noalias nocapture re ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_4( +; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX512VL-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> poison, i32* [[T1:%.*]], i64 0 ; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> poison, <4 x i32> zeroinitializer ; AVX512VL-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x i32*> [[SHUFFLE]], <4 x i64> -; AVX512VL-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 ; AVX512VL-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 ; AVX512VL-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 ; AVX512VL-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 @@ -749,47 +749,47 @@ define void @gather_load_div(float* noalias nocapture %0, float* noalias nocaptu ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( -; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <2 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <2 x float*> [[TMP3]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr float, <2 x float*> [[TMP4]], <2 x i64> -; AVX512F-NEXT: [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0 -; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> -; AVX512F-NEXT: [[TMP8:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 -; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> -; AVX512F-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <8 x i32> -; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> [[TMP11]], <8 x i32> -; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <8 x i32> -; AVX512F-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> -; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7 -; AVX512F-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP9]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]] +; AVX512F-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0 +; AVX512F-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> +; AVX512F-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 +; AVX512F-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> +; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX512F-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 +; AVX512F-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> +; AVX512F-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> +; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> +; AVX512F-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7 +; AVX512F-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer +; AVX512F-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> +; AVX512F-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512F-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]] ; AVX512F-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* ; AVX512F-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_div( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <2 x float*> poison, float* [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <2 x float*> [[TMP3]], <2 x float*> poison, <2 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr float, <2 x float*> [[TMP4]], <2 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = insertelement <4 x float*> poison, float* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP6]], <4 x float*> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> -; AVX512VL-NEXT: [[TMP8:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 -; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> poison, <8 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> -; AVX512VL-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 -; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <4 x float*> [[TMP7]], <4 x float*> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <8 x float*> [[TMP8]], <8 x float*> [[TMP11]], <8 x i32> -; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <8 x i32> -; AVX512VL-NEXT: [[TMP14:%.*]] = shufflevector <8 x float*> [[TMP12]], <8 x float*> [[TMP13]], <8 x i32> -; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <8 x float*> [[TMP14]], float* [[TMP10]], i64 7 -; AVX512VL-NEXT: [[TMP16:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP15]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP9]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP16]], [[TMP17]] +; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x float*> poison, float* [[TMP1:%.*]], i64 0 +; AVX512VL-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float*> [[TMP3]], <4 x float*> poison, <4 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP4:%.*]] = getelementptr float, <4 x float*> [[SHUFFLE1]], <4 x i64> +; AVX512VL-NEXT: [[TMP5:%.*]] = insertelement <2 x float*> poison, float* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP6:%.*]] = shufflevector <2 x float*> [[TMP5]], <2 x float*> poison, <2 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr float, <2 x float*> [[TMP6]], <2 x i64> +; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, float* [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP9:%.*]] = insertelement <8 x float*> poison, float* [[TMP1]], i64 0 +; AVX512VL-NEXT: [[TMP10:%.*]] = shufflevector <4 x float*> [[TMP4]], <4 x float*> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP11:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> [[TMP10]], <8 x i32> +; AVX512VL-NEXT: [[TMP12:%.*]] = shufflevector <2 x float*> [[TMP7]], <2 x float*> poison, <8 x i32> +; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <8 x float*> [[TMP11]], <8 x float*> [[TMP12]], <8 x i32> +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <8 x float*> [[TMP13]], float* [[TMP8]], i64 7 +; AVX512VL-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP14]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float*> [[TMP9]], <8 x float*> poison, <8 x i32> zeroinitializer +; AVX512VL-NEXT: [[TMP16:%.*]] = getelementptr float, <8 x float*> [[SHUFFLE]], <8 x i64> +; AVX512VL-NEXT: [[TMP17:%.*]] = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> [[TMP16]], i32 4, <8 x i1> , <8 x float> undef), !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP18:%.*]] = fdiv <8 x float> [[TMP15]], [[TMP17]] ; AVX512VL-NEXT: [[TMP19:%.*]] = bitcast float* [[TMP0:%.*]] to <8 x float>* ; AVX512VL-NEXT: store <8 x float> [[TMP18]], <8 x float>* [[TMP19]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll index e2cc14a..6e434fe 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll @@ -6,12 +6,12 @@ target triple = "x86_64-unknown-linux-gnu" define <4 x i32> @foo(<4 x i32> %x, i32 %f) { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F]], i64 0 +; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F:%.*]], i64 0 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[F]], 1 ; CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[VECINIT51:%.*]] = shufflevector <4 x i32> [[VECINIT1]], <4 x i32> [[TMP4]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[VECINIT51]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll index 7294b22..4301c3f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll @@ -33,30 +33,30 @@ define float @foo(float* nocapture readonly %A) { ; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x float> [ [[TMP1]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX7]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> poison, float [[TMP4]], i32 0 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[TMP7]], i32 1 ; CHECK-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP9]], -; CHECK-NEXT: [[TMP11:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = load float, float* [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP12]], 9.000000e+00 +; CHECK-NEXT: [[TMP11]] = fadd <2 x float> [[TMP5]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = load float, float* [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP13]], 9.000000e+00 ; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP13]], 121 -; CHECK-NEXT: [[TMP14]] = fadd <2 x float> [[TMP5]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP14]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]] ; CHECK: for.body.for.body_crit_edge: ; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 ; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[TMP15]], [[TMP16]] ; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]] ; CHECK-NEXT: ret float [[ADD17]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll index 6946ab2..a4a388e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll @@ -21,11 +21,11 @@ define void @foo(%class.e* %this, %struct.a* %p, i32 %add7) { ; CHECK-NEXT: i32 2, label [[SW_BB]] ; CHECK-NEXT: ] ; CHECK: sw.bb: +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[G]] to <2 x i32>* +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i32>, <2 x i32>* [[TMP2]], align 4 ; CHECK-NEXT: [[SHRINK_SHUFFLE:%.*]] = shufflevector <4 x i32> [[SHUFFLE]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = xor <2 x i32> [[SHRINK_SHUFFLE]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[G]] to <2 x i32>* -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP4]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i32> [[SHRINK_SHUFFLE]], +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP3]], [[TMP4]] ; CHECK-NEXT: br label [[SW_EPILOG]] ; CHECK: sw.epilog: ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP5]], [[SW_BB]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll index ade84eb..0188dd7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp-inseltpoison.ll @@ -1065,14 +1065,14 @@ define void @sitofp_16i8_16f32() #0 { define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 { ; SSE-LABEL: @sitofp_4xi32_4f64( -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A2:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A3:%.*]], i32 1 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1 ; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0 -; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[A1:%.*]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[A2:%.*]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[A3:%.*]], i32 1 ; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> -; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> ; SSE-NEXT: [[RES31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> ; SSE-NEXT: ret <4 x double> [[RES31]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll index 8c68584..00e077b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sitofp.ll @@ -1065,14 +1065,14 @@ define void @sitofp_16i8_16f32() #0 { define <4 x double> @sitofp_4xi32_4f64(i32 %a0, i32 %a1, i32 %a2, i32 %a3) #0 { ; SSE-LABEL: @sitofp_4xi32_4f64( -; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A2:%.*]], i32 0 -; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A3:%.*]], i32 1 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1 ; SSE-NEXT: [[TMP3:%.*]] = sitofp <2 x i32> [[TMP2]] to <2 x double> -; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0 -; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[A1:%.*]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> poison, i32 [[A2:%.*]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[A3:%.*]], i32 1 ; SSE-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x double> -; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> -; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; SSE-NEXT: [[TMP8:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <4 x i32> ; SSE-NEXT: [[RES31:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> [[TMP8]], <4 x i32> ; SSE-NEXT: ret <4 x double> [[RES31]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll index 71fb5cd..f773910 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll @@ -19,15 +19,15 @@ define i32 @non-ordered-stores(i32* noalias nocapture %in, i32* noalias nocaptur ; CHECK-NEXT: [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4 ; CHECK-NEXT: [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3 ; CHECK-NEXT: [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_4]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_8]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[LOAD_3]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[LOAD_7]], i32 1 ; CHECK-NEXT: [[TMP5:%.*]] = mul <2 x i32> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_1]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOAD_3]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_5]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[LOAD_7]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP6]], i32 [[LOAD_4]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> poison, i32 [[LOAD_6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> [[TMP8]], i32 [[LOAD_8]], i32 1 ; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i32> [[TMP7]], [[TMP9]] ; CHECK-NEXT: br label [[BLOCK1:%.*]] ; CHECK: block1: @@ -42,9 +42,9 @@ define i32 @non-ordered-stores(i32* noalias nocapture %in, i32* noalias nocaptur ; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 4 ; CHECK-NEXT: store i32 [[LOAD_9]], i32* [[GEP_9]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[GEP_10]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP10]], <2 x i32>* [[TMP11]], align 4 +; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP11]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[GEP_7]] to <2 x i32>* -; CHECK-NEXT: store <2 x i32> [[TMP5]], <2 x i32>* [[TMP12]], align 4 +; CHECK-NEXT: store <2 x i32> [[TMP10]], <2 x i32>* [[TMP12]], align 4 ; CHECK-NEXT: ret i32 undef ; %in.addr = getelementptr inbounds i32, i32* %in, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll index 109c27e..87709a8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-widest-phis.ll @@ -16,8 +16,8 @@ define void @foo() { ; CHECK-NEXT: [[TMP3:%.*]] = load double, double* undef, align 8 ; CHECK-NEXT: br i1 undef, label [[BB3]], label [[BB4:%.*]] ; CHECK: bb4: -; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double> ; CHECK-NEXT: [[CONV2:%.*]] = uitofp i16 undef to double +; CHECK-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP2]] to <4 x double> ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> , double [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> , double [[CONV2]], i32 1 ; CHECK-NEXT: [[TMP7:%.*]] = fsub <2 x double> [[TMP5]], [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll index cc2ee73..33ba979 100644 --- a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll @@ -133,27 +133,27 @@ define void @phi_float32(half %hval, float %fval) { ; MAX256-NEXT: br label [[BB1:%.*]] ; MAX256: bb1: ; MAX256-NEXT: [[I:%.*]] = fpext half [[HVAL:%.*]] to float +; MAX256-NEXT: [[I3:%.*]] = fpext half [[HVAL]] to float +; MAX256-NEXT: [[I6:%.*]] = fpext half [[HVAL]] to float +; MAX256-NEXT: [[I9:%.*]] = fpext half [[HVAL]] to float ; MAX256-NEXT: [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0 ; MAX256-NEXT: [[SHUFFLE11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer ; MAX256-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0 ; MAX256-NEXT: [[SHUFFLE12:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer ; MAX256-NEXT: [[TMP2:%.*]] = fmul <8 x float> [[SHUFFLE11]], [[SHUFFLE12]] -; MAX256-NEXT: [[I3:%.*]] = fpext half [[HVAL]] to float -; MAX256-NEXT: [[TMP3:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 -; MAX256-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP4:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]] -; MAX256-NEXT: [[I6:%.*]] = fpext half [[HVAL]] to float -; MAX256-NEXT: [[TMP5:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 -; MAX256-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP6:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]] -; MAX256-NEXT: [[I9:%.*]] = fpext half [[HVAL]] to float -; MAX256-NEXT: [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 -; MAX256-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX256-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]] -; MAX256-NEXT: [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP4]] -; MAX256-NEXT: [[TMP10:%.*]] = fadd <8 x float> zeroinitializer, [[TMP6]] -; MAX256-NEXT: [[TMP11:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]] -; MAX256-NEXT: [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]] +; MAX256-NEXT: [[TMP3:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]] +; MAX256-NEXT: [[TMP4:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 +; MAX256-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX256-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]] +; MAX256-NEXT: [[TMP6:%.*]] = fadd <8 x float> zeroinitializer, [[TMP5]] +; MAX256-NEXT: [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 +; MAX256-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX256-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]] +; MAX256-NEXT: [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]] +; MAX256-NEXT: [[TMP10:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 +; MAX256-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP10]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX256-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]] +; MAX256-NEXT: [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP11]] ; MAX256-NEXT: switch i32 undef, label [[BB5:%.*]] [ ; MAX256-NEXT: i32 0, label [[BB2:%.*]] ; MAX256-NEXT: i32 1, label [[BB3:%.*]] @@ -166,10 +166,10 @@ define void @phi_float32(half %hval, float %fval) { ; MAX256: bb5: ; MAX256-NEXT: br label [[BB2]] ; MAX256: bb2: -; MAX256-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] -; MAX256-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP10]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP10]], [[BB5]] ], [ [[TMP10]], [[BB1]] ] -; MAX256-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP11]], [[BB3]] ], [ [[TMP11]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP11]], [[BB1]] ] -; MAX256-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[TMP12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] +; MAX256-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] +; MAX256-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP9]], [[BB5]] ], [ [[TMP9]], [[BB1]] ] +; MAX256-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP12]], [[BB1]] ] +; MAX256-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP3]], [[BB3]] ], [ [[TMP3]], [[BB4]] ], [ [[TMP3]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] ; MAX256-NEXT: [[TMP17:%.*]] = extractelement <8 x float> [[TMP14]], i32 7 ; MAX256-NEXT: store float [[TMP17]], float* undef, align 4 ; MAX256-NEXT: ret void @@ -179,27 +179,27 @@ define void @phi_float32(half %hval, float %fval) { ; MAX1024-NEXT: br label [[BB1:%.*]] ; MAX1024: bb1: ; MAX1024-NEXT: [[I:%.*]] = fpext half [[HVAL:%.*]] to float +; MAX1024-NEXT: [[I3:%.*]] = fpext half [[HVAL]] to float +; MAX1024-NEXT: [[I6:%.*]] = fpext half [[HVAL]] to float +; MAX1024-NEXT: [[I9:%.*]] = fpext half [[HVAL]] to float ; MAX1024-NEXT: [[TMP0:%.*]] = insertelement <8 x float> poison, float [[I]], i32 0 ; MAX1024-NEXT: [[SHUFFLE11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <8 x i32> zeroinitializer ; MAX1024-NEXT: [[TMP1:%.*]] = insertelement <8 x float> poison, float [[FVAL:%.*]], i32 0 ; MAX1024-NEXT: [[SHUFFLE12:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <8 x i32> zeroinitializer ; MAX1024-NEXT: [[TMP2:%.*]] = fmul <8 x float> [[SHUFFLE11]], [[SHUFFLE12]] -; MAX1024-NEXT: [[I3:%.*]] = fpext half [[HVAL]] to float -; MAX1024-NEXT: [[TMP3:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 -; MAX1024-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP4:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]] -; MAX1024-NEXT: [[I6:%.*]] = fpext half [[HVAL]] to float -; MAX1024-NEXT: [[TMP5:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 -; MAX1024-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP6:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]] -; MAX1024-NEXT: [[I9:%.*]] = fpext half [[HVAL]] to float -; MAX1024-NEXT: [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 -; MAX1024-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer -; MAX1024-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]] -; MAX1024-NEXT: [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP4]] -; MAX1024-NEXT: [[TMP10:%.*]] = fadd <8 x float> zeroinitializer, [[TMP6]] -; MAX1024-NEXT: [[TMP11:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]] -; MAX1024-NEXT: [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]] +; MAX1024-NEXT: [[TMP3:%.*]] = fadd <8 x float> zeroinitializer, [[TMP2]] +; MAX1024-NEXT: [[TMP4:%.*]] = insertelement <8 x float> poison, float [[I3]], i32 0 +; MAX1024-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX1024-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[SHUFFLE]], [[SHUFFLE12]] +; MAX1024-NEXT: [[TMP6:%.*]] = fadd <8 x float> zeroinitializer, [[TMP5]] +; MAX1024-NEXT: [[TMP7:%.*]] = insertelement <8 x float> poison, float [[I6]], i32 0 +; MAX1024-NEXT: [[SHUFFLE5:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX1024-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[SHUFFLE5]], [[SHUFFLE12]] +; MAX1024-NEXT: [[TMP9:%.*]] = fadd <8 x float> zeroinitializer, [[TMP8]] +; MAX1024-NEXT: [[TMP10:%.*]] = insertelement <8 x float> poison, float [[I9]], i32 0 +; MAX1024-NEXT: [[SHUFFLE8:%.*]] = shufflevector <8 x float> [[TMP10]], <8 x float> poison, <8 x i32> zeroinitializer +; MAX1024-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[SHUFFLE8]], [[SHUFFLE12]] +; MAX1024-NEXT: [[TMP12:%.*]] = fadd <8 x float> zeroinitializer, [[TMP11]] ; MAX1024-NEXT: switch i32 undef, label [[BB5:%.*]] [ ; MAX1024-NEXT: i32 0, label [[BB2:%.*]] ; MAX1024-NEXT: i32 1, label [[BB3:%.*]] @@ -212,10 +212,10 @@ define void @phi_float32(half %hval, float %fval) { ; MAX1024: bb5: ; MAX1024-NEXT: br label [[BB2]] ; MAX1024: bb2: -; MAX1024-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] -; MAX1024-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP10]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP10]], [[BB5]] ], [ [[TMP10]], [[BB1]] ] -; MAX1024-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP11]], [[BB3]] ], [ [[TMP11]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP11]], [[BB1]] ] -; MAX1024-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[TMP12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] +; MAX1024-NEXT: [[TMP13:%.*]] = phi <8 x float> [ [[TMP6]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] +; MAX1024-NEXT: [[TMP14:%.*]] = phi <8 x float> [ [[TMP9]], [[BB3]] ], [ [[SHUFFLE12]], [[BB4]] ], [ [[TMP9]], [[BB5]] ], [ [[TMP9]], [[BB1]] ] +; MAX1024-NEXT: [[TMP15:%.*]] = phi <8 x float> [ [[TMP12]], [[BB3]] ], [ [[TMP12]], [[BB4]] ], [ [[SHUFFLE12]], [[BB5]] ], [ [[TMP12]], [[BB1]] ] +; MAX1024-NEXT: [[TMP16:%.*]] = phi <8 x float> [ [[TMP3]], [[BB3]] ], [ [[TMP3]], [[BB4]] ], [ [[TMP3]], [[BB5]] ], [ [[SHUFFLE12]], [[BB1]] ] ; MAX1024-NEXT: [[TMP17:%.*]] = extractelement <8 x float> [[TMP14]], i32 7 ; MAX1024-NEXT: store float [[TMP17]], float* undef, align 4 ; MAX1024-NEXT: ret void -- 2.7.4