From 09e516c54b0c5ec3fe6b3ef6c70d9e09e89abc95 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 14 Nov 2018 13:11:49 +0000
Subject: [PATCH] [VPlan, SLP] Add simple SLP analysis on top of VPlan.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This patch adds an initial implementation of the look-ahead SLP tree
construction described in 'Look-Ahead SLP: Auto-vectorization in the
Presence of Commutative Operations, CGO 2018 by Vasileios Porpodas,
Rodrigo C. O. Rocha, Luís F. W. Góes'.

It returns an SLP tree represented as VPInstructions, with combined
instructions represented as a single, wider VPInstruction.

This initial version does not support instructions with multiple
different users (either inside or outside the SLP tree) or
non-instruction operands; it won't generate any shuffles or
insertelement instructions.

It also only adds the analysis that builds an SLP tree rooted in a set
of stores. It does not include any cost modeling or memory legality
checks. The plan is to integrate it with VPlan based cost modeling once
available, and to only apply it to operations that can be widened.

A follow-up patch will add support for replacing instructions in a
VPlan with their SLP counterparts.

Reviewers: Ayal, mssimpso, rengolin, mkuper, hfinkel, hsaito, dcaballe,
vporpo, RKSimon, ABataev

Reviewed By: rengolin

Differential Revision: https://reviews.llvm.org/D4949

llvm-svn: 346857
---
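Notes (not part of the commit message): a minimal sketch of how a client is
expected to drive the analysis, mirroring the unit tests added below. The
Entry/VPIAI setup and the store positions (12 and 14) assume the loop body
from testSlpSimple_2 in VPlanSlpTest.cpp; nothing here is API beyond what
the patch itself adds.

    // Locate the two adjacent stores in the single loop-body VPBasicBlock.
    VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
    VPInstruction *Store1 =
        cast<VPInstruction>(&*std::next(Body->begin(), 12));
    VPInstruction *Store2 =
        cast<VPInstruction>(&*std::next(Body->begin(), 14));

    // Seed the analysis with the store bundle; buildGraph combines operand
    // bundles bottom-up and returns a combined SLPStore root, or nullptr if
    // some bundle could not be combined.
    VPlanSlp Slp(VPIAI, *Body);
    SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
    if (VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot)) {
      assert(Slp.isCompletelySLP() && "all visited bundles were combined");
      // Two i32 lanes per bundle make the widest combined bundle 64 bits.
      unsigned Bits = Slp.getWidestBundleBits();
      (void)Bits;
    }
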
 llvm/lib/Transforms/Vectorize/CMakeLists.txt       |   1 +
 llvm/lib/Transforms/Vectorize/VPlan.cpp            |  13 +
 llvm/lib/Transforms/Vectorize/VPlan.h              | 125 ++-
 llvm/lib/Transforms/Vectorize/VPlanSLP.cpp         | 469 +++++++++++
 llvm/lib/Transforms/Vectorize/VPlanValue.h         |  16 +
 llvm/unittests/Transforms/Vectorize/CMakeLists.txt |   1 +
 .../Transforms/Vectorize/VPlanSlpTest.cpp          | 899 +++++++++++++++++++++
 7 files changed, 1523 insertions(+), 1 deletion(-)
 create mode 100644 llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
 create mode 100644 llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp

diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 27a4d24..06eaadf 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -7,6 +7,7 @@ add_llvm_library(LLVMVectorize
   VPlan.cpp
   VPlanHCFGBuilder.cpp
   VPlanHCFGTransforms.cpp
+  VPlanSLP.cpp
   VPlanVerifier.cpp

   ADDITIONAL_HEADER_DIRS
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 541378de..dad733b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -338,6 +338,12 @@ void VPInstruction::print(raw_ostream &O) const {
   case VPInstruction::ICmpULE:
     O << "icmp ule";
     break;
+  case VPInstruction::SLPLoad:
+    O << "combined load";
+    break;
+  case VPInstruction::SLPStore:
+    O << "combined store";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
@@ -681,6 +687,13 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O,

 template void DomTreeBuilder<VPDominatorTree>::Calculate(VPDominatorTree &DT);

+void VPValue::replaceAllUsesWith(VPValue *New) {
+  for (VPUser *User : users())
+    for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
+      if (User->getOperand(I) == this)
+        User->setOperand(I, New);
+}
+
 void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
                                           Old2NewTy &Old2New,
                                           InterleavedAccessInfo &IAI) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 6633322..dd50db3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -60,6 +60,7 @@ class Value;
 class VPBasicBlock;
 class VPRegionBlock;
 class VPlan;
+class VPlanSlp;

 /// A range of powers-of-2 vectorization factors with fixed start and
 /// adjustable end. The range includes start and excludes end, e.g.,:
@@ -609,10 +610,16 @@ public:
 /// the VPInstruction is also a single def-use vertex.
 class VPInstruction : public VPUser, public VPRecipeBase {
   friend class VPlanHCFGTransforms;
+  friend class VPlanSlp;

 public:
   /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
-  enum { Not = Instruction::OtherOpsEnd + 1, ICmpULE };
+  enum {
+    Not = Instruction::OtherOpsEnd + 1,
+    ICmpULE,
+    SLPLoad,
+    SLPStore,
+  };

 private:
   typedef unsigned char OpcodeTy;
@@ -622,6 +629,13 @@ private:
   /// modeled instruction.
   void generateInstruction(VPTransformState &State, unsigned Part);

+protected:
+  Instruction *getUnderlyingInstr() {
+    return cast_or_null<Instruction>(getUnderlyingValue());
+  }
+
+  void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
+
 public:
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
       : VPUser(VPValue::VPInstructionSC, Operands),
@@ -635,6 +649,11 @@ public:
     return V->getVPValueID() == VPValue::VPInstructionSC;
   }

+  VPInstruction *clone() const {
+    SmallVector<VPValue *, 2> Operands(operands());
+    return new VPInstruction(Opcode, Operands);
+  }
+
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPRecipeBase *R) {
     return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC;
@@ -652,6 +671,14 @@ public:

   /// Print the VPInstruction.
   void print(raw_ostream &O) const;
+
+  /// Return true if this instruction may modify memory.
+  bool mayWriteToMemory() const {
+    // TODO: we can use attributes of the called function to rule out memory
+    //       modifications.
+    return Opcode == Instruction::Store || Opcode == Instruction::Call ||
+           Opcode == Instruction::Invoke || Opcode == SLPStore;
+  }
 };

 /// VPWidenRecipe is a recipe for producing a copy of vector type for each
@@ -1508,6 +1535,102 @@ public:
   }
 };

+/// Class that maps (parts of) an existing VPlan to trees of combined
+/// VPInstructions.
+class VPlanSlp {
+private:
+  enum class OpMode { Failed, Load, Opcode };
+
+  /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
+  /// DenseMap keys.
+  struct BundleDenseMapInfo {
+    static SmallVector<VPValue *, 4> getEmptyKey() {
+      return {reinterpret_cast<VPValue *>(-1)};
+    }
+
+    static SmallVector<VPValue *, 4> getTombstoneKey() {
+      return {reinterpret_cast<VPValue *>(-2)};
+    }
+
+    static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
+      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+    }
+
+    static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
+                        const SmallVector<VPValue *, 4> &RHS) {
+      return LHS == RHS;
+    }
+  };
+
+  /// Mapping of values in the original VPlan to a combined VPInstruction.
+  DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
+      BundleToCombined;
+
+  VPInterleavedAccessInfo &IAI;
+
+  /// Basic block to operate on. For now, only instructions in a single BB are
+  /// considered.
+  const VPBasicBlock &BB;
+
+  /// Indicates whether we managed to combine all visited instructions or not.
+  bool CompletelySLP = true;
+
+  /// Width of the widest combined bundle in bits.
+  unsigned WidestBundleBits = 0;
+
+  using MultiNodeOpTy =
+      typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
+
+  // Input operand bundles for the current multi node. Each multi node operand
+  // bundle contains values not matching the multi node's opcode. They will
+  // be reordered in reorderMultiNodeOps, once we completed building a
+  // multi node.
+  SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
+
+  /// Indicates whether we are building a multi node currently.
+  bool MultiNodeActive = false;
+
+  /// Check if we can vectorize Operands together.
+  bool areVectorizable(ArrayRef<VPValue *> Operands) const;
+
+  /// Add combined instruction \p New for the bundle \p Operands.
+  void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);
+
+  /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
+  VPInstruction *markFailed();
+
+  /// Reorder operands in the multi node to maximize sequential memory access
+  /// and commutative operations.
+  SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
+
+  /// Choose the best candidate to use for the lane after \p Last. The set of
+  /// candidates to choose from are values with an opcode matching \p Last's
+  /// or loads consecutive to \p Last.
+  std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
+                                       SmallVectorImpl<VPValue *> &Candidates,
+                                       VPInterleavedAccessInfo &IAI);
+
+  /// Print bundle \p Values to dbgs().
+  void dumpBundle(ArrayRef<VPValue *> Values);
+
+public:
+  VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
+
+  ~VPlanSlp() {
+    for (auto &KV : BundleToCombined)
+      delete KV.second;
+  }
+
+  /// Tries to build an SLP tree rooted at \p Operands and returns a
+  /// VPInstruction combining \p Operands, if they can be combined.
+  VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);
+
+  /// Return the width of the widest combined bundle in bits.
+  unsigned getWidestBundleBits() const { return WidestBundleBits; }
+
+  /// Return true if all visited instructions can be combined.
+  bool isCompletelySLP() const { return CompletelySLP; }
+};
 } // end namespace llvm

 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
new file mode 100644
index 0000000..679fb51
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -0,0 +1,469 @@
+//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// This file implements SLP analysis based on VPlan. The analysis is based on
+/// the ideas described in
+///
+///   Look-ahead SLP: auto-vectorization in the presence of commutative
+///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+///   Luís F. W. Góes
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan-slp"
+
+// Number of levels to look ahead when re-ordering multi node operands.
+static unsigned LookaheadMaxDepth = 5;
+
+VPInstruction *VPlanSlp::markFailed() {
+  // FIXME: Currently this is used to signal we hit instructions we cannot
+  //        trivially SLP'ize.
+  CompletelySLP = false;
+  return nullptr;
+}
+
+void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
+  if (all_of(Operands, [](VPValue *V) {
+        return cast<VPInstruction>(V)->getUnderlyingInstr();
+      })) {
+    unsigned BundleSize = 0;
+    for (VPValue *V : Operands) {
+      Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
+      assert(!T->isVectorTy() && "Only scalar types supported for now");
+      BundleSize += T->getScalarSizeInBits();
+    }
+    WidestBundleBits = std::max(WidestBundleBits, BundleSize);
+  }
+
+  auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
+  assert(Res.second &&
+         "Already created a combined instruction for the operand bundle");
+  (void)Res;
+}
+
+bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
+  // Currently we only support VPInstructions.
+  if (!all_of(Operands, [](VPValue *Op) {
+        return Op && isa<VPInstruction>(Op) &&
+               cast<VPInstruction>(Op)->getUnderlyingInstr();
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
+    return false;
+  }
+
+  // Check if opcodes and type width agree for all instructions in the bundle.
+  // FIXME: Differing widths/opcodes can be handled by inserting additional
+  //        instructions.
+  // FIXME: Deal with non-primitive types.
+  const Instruction *OriginalInstr =
+      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
+  unsigned Opcode = OriginalInstr->getOpcode();
+  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
+  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
+        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
+        return I->getOpcode() == Opcode &&
+               I->getType()->getPrimitiveSizeInBits() == Width;
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree\n");
+    return false;
+  }
+
+  // For now, all operands must be defined in the same BB.
+  if (any_of(Operands, [this](VPValue *Op) {
+        return cast<VPInstruction>(Op)->getParent() != &this->BB;
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
+    return false;
+  }
+
+  if (any_of(Operands,
+             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
+    return false;
+  }
+
+  // For loads, check that there are no instructions writing to memory in
+  // between them.
+  // TODO: we only have to forbid instructions writing to memory that could
+  //       interfere with any of the loads in the bundle.
+  if (Opcode == Instruction::Load) {
+    unsigned LoadsSeen = 0;
+    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
+    for (auto &I : *Parent) {
+      auto *VPI = cast<VPInstruction>(&I);
+      if (VPI->getOpcode() == Instruction::Load &&
+          std::find(Operands.begin(), Operands.end(), VPI) != Operands.end())
+        LoadsSeen++;
+
+      if (LoadsSeen == Operands.size())
+        break;
+      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
+        LLVM_DEBUG(
+            dbgs() << "VPSLP: instruction modifying memory between loads\n");
+        return false;
+      }
+    }
+
+    if (!all_of(Operands, [](VPValue *Op) {
+          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
+              ->isSimple();
+        })) {
+      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
+      return false;
+    }
+  }
+
+  if (Opcode == Instruction::Store)
+    if (!all_of(Operands, [](VPValue *Op) {
+          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
+              ->isSimple();
+        })) {
+      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
+      return false;
+    }
+
+  return true;
+}
+
+static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
+                                             unsigned OperandIndex) {
+  SmallVector<VPValue *, 4> Operands;
+  for (VPValue *V : Values) {
+    auto *U = cast<VPUser>(V);
+    Operands.push_back(U->getOperand(OperandIndex));
+  }
+  return Operands;
+}
+
+static bool areCommutative(ArrayRef<VPValue *> Values) {
+  return Instruction::isCommutative(
+      cast<VPInstruction>(Values[0])->getOpcode());
+}
+
+static SmallVector<SmallVector<VPValue *, 4>, 4>
+getOperands(ArrayRef<VPValue *> Values) {
+  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
+  auto *VPI = cast<VPInstruction>(Values[0]);
+
+  switch (VPI->getOpcode()) {
+  case Instruction::Load:
+    llvm_unreachable("Loads terminate a tree, no need to get operands");
+  case Instruction::Store:
+    Result.push_back(getOperands(Values, 0));
+    break;
+  default:
+    for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
+      Result.push_back(getOperands(Values, I));
+    break;
+  }
+
+  return Result;
+}
+
+/// Returns the opcode of Values or None if they do not all agree.
+static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
+  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
+  if (any_of(Values, [Opcode](VPValue *V) {
+        return cast<VPInstruction>(V)->getOpcode() != Opcode;
+      }))
+    return None;
+  return {Opcode};
+}
+
+/// Returns true if A and B access sequential memory if they are loads or
+/// stores, or if they have identical opcodes otherwise.
+static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
+                                  VPInterleavedAccessInfo &IAI) {
+  if (A->getOpcode() != B->getOpcode())
+    return false;
+
+  if (A->getOpcode() != Instruction::Load &&
+      A->getOpcode() != Instruction::Store)
+    return true;
+  auto *GA = IAI.getInterleaveGroup(A);
+  auto *GB = IAI.getInterleaveGroup(B);
+
+  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
+}
+
+/// Implements getLAScore from Listing 7 in the paper.
+/// Traverses and compares operands of V1 and V2 to MaxLevel.
+static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
+                           VPInterleavedAccessInfo &IAI) {
+  if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
+    return 0;
+
+  if (MaxLevel == 0)
+    return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
+                                           cast<VPInstruction>(V2), IAI);
+
+  unsigned Score = 0;
+  for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
+    for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
+      Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
+                          cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
+  return Score;
+}
+
+std::pair<VPlanSlp::OpMode, VPValue *>
+VPlanSlp::getBest(OpMode Mode, VPValue *Last,
+                  SmallVectorImpl<VPValue *> &Candidates,
+                  VPInterleavedAccessInfo &IAI) {
+  LLVM_DEBUG(dbgs() << "    getBest\n");
+  VPValue *Best = Candidates[0];
+  SmallVector<VPValue *, 4> BestCandidates;
+
+  LLVM_DEBUG(dbgs() << "    Candidates for "
+                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
+  for (auto *Candidate : Candidates) {
+    auto *LastI = cast<VPInstruction>(Last);
+    auto *CandidateI = cast<VPInstruction>(Candidate);
+    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
+      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
+                        << " ");
+      BestCandidates.push_back(Candidate);
+    }
+  }
+  LLVM_DEBUG(dbgs() << "\n");
+
+  if (BestCandidates.empty())
+    return {OpMode::Failed, nullptr};
+
+  if (BestCandidates.size() == 1)
+    return {Mode, BestCandidates[0]};
+
+  if (Mode == OpMode::Opcode) {
+    unsigned BestScore = 0;
+    for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
+      unsigned PrevScore = ~0u;
+      bool AllSame = true;
+
+      // FIXME: Avoid visiting the same operands multiple times.
+      for (auto *Candidate : BestCandidates) {
+        unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
+        if (PrevScore == ~0u)
+          PrevScore = Score;
+        if (PrevScore != Score)
+          AllSame = false;
+        PrevScore = Score;
+
+        if (Score > BestScore) {
+          BestScore = Score;
+          Best = Candidate;
+        }
+      }
+      if (!AllSame)
+        break;
+    }
+  }
+  LLVM_DEBUG(dbgs() << "Found best "
+                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
+                    << "\n");
+  Candidates.erase(std::remove(Candidates.begin(), Candidates.end(), Best),
+                   Candidates.end());
+
+  return {Mode, Best};
+}
+
+SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
+  SmallVector<MultiNodeOpTy, 4> FinalOrder;
+  SmallVector<OpMode, 4> Mode;
+  FinalOrder.reserve(MultiNodeOps.size());
+  Mode.reserve(MultiNodeOps.size());
+
+  LLVM_DEBUG(dbgs() << "Reordering multinode\n");
+
+  for (auto &Operands : MultiNodeOps) {
+    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
+    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
+        Instruction::Load)
+      Mode.push_back(OpMode::Load);
+    else
+      Mode.push_back(OpMode::Opcode);
+  }
+
+  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
+    LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
+    SmallVector<VPValue *, 4> Candidates;
+    Candidates.reserve(MultiNodeOps.size());
+    LLVM_DEBUG(dbgs() << "  Candidates ");
+    for (auto Ops : MultiNodeOps) {
+      LLVM_DEBUG(
+          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
+                 << " ");
+      Candidates.push_back(Ops.second[Lane]);
+    }
+    LLVM_DEBUG(dbgs() << "\n");
+
+    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
+      LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
+      if (Mode[Op] == OpMode::Failed)
+        continue;
+
+      VPValue *Last = FinalOrder[Op].second[Lane - 1];
+      std::pair<OpMode, VPValue *> Res =
+          getBest(Mode[Op], Last, Candidates, IAI);
+      if (Res.second)
+        FinalOrder[Op].second.push_back(Res.second);
+      else
+        // TODO: handle this case
+        FinalOrder[Op].second.push_back(markFailed());
+    }
+  }
+
+  return FinalOrder;
+}
+
+void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
+  LLVM_DEBUG(dbgs() << " Ops: ");
+  for (auto Op : Values)
+    if (auto *Instr =
+            cast_or_null<VPInstruction>(Op)->getUnderlyingInstr())
+      LLVM_DEBUG(dbgs() << *Instr << " | ");
+    else
+      LLVM_DEBUG(dbgs() << " nullptr | ");
+  LLVM_DEBUG(dbgs() << "\n");
+}
+
+VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
+  assert(!Values.empty() && "Need some operands!");
+
+  // If we already visited this instruction bundle, re-use the existing node.
+  auto I = BundleToCombined.find(to_vector<4>(Values));
+  if (I != BundleToCombined.end()) {
+#ifndef NDEBUG
+    // Check that the resulting graph is a tree. If we re-use a node, this
+    // means its values have multiple users. We only allow this, if all users
+    // of each value are the same instruction.
+    for (auto *V : Values) {
+      auto UI = V->user_begin();
+      auto *FirstUser = *UI++;
+      while (UI != V->user_end()) {
+        assert(*UI == FirstUser && "Currently we only support SLP trees.");
+        UI++;
+      }
+    }
+#endif
+    return I->second;
+  }
+
+  // Dump inputs
+  LLVM_DEBUG({
+    dbgs() << "buildGraph: ";
+    dumpBundle(Values);
+  });
+
+  if (!areVectorizable(Values))
+    return markFailed();
+
+  assert(getOpcode(Values) && "Opcodes for all values must match");
+  unsigned ValuesOpcode = getOpcode(Values).getValue();
+
+  SmallVector<VPValue *, 4> CombinedOperands;
+  if (areCommutative(Values)) {
+    bool MultiNodeRoot = !MultiNodeActive;
+    MultiNodeActive = true;
+    for (auto &Operands : getOperands(Values)) {
+      LLVM_DEBUG({
+        dbgs() << "  Visiting Commutative";
+        dumpBundle(Operands);
+      });
+
+      auto OperandsOpcode = getOpcode(Operands);
+      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
+        LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n");
+        CombinedOperands.push_back(buildGraph(Operands));
+      } else {
+        LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n");
+        // Create a dummy VPInstruction, which we will replace later by the
+        // re-ordered operand.
+        VPInstruction *Op = new VPInstruction(0, {});
+        CombinedOperands.push_back(Op);
+        MultiNodeOps.emplace_back(Op, Operands);
+      }
+    }
+
+    if (MultiNodeRoot) {
+      LLVM_DEBUG(dbgs() << "Reorder\n");
+      MultiNodeActive = false;
+
+      auto FinalOrder = reorderMultiNodeOps();
+
+      MultiNodeOps.clear();
+      for (auto &Ops : FinalOrder) {
+        VPInstruction *NewOp = buildGraph(Ops.second);
+        Ops.first->replaceAllUsesWith(NewOp);
+        for (unsigned i = 0; i < CombinedOperands.size(); i++)
+          if (CombinedOperands[i] == Ops.first)
+            CombinedOperands[i] = NewOp;
+        delete Ops.first;
+        Ops.first = NewOp;
+      }
+      LLVM_DEBUG(dbgs() << "Found final order\n");
+    }
+  } else {
+    LLVM_DEBUG(dbgs() << "  Non-commutative\n");
+    if (ValuesOpcode == Instruction::Load)
+      for (VPValue *V : Values)
+        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
+    else
+      for (auto &Operands : getOperands(Values))
+        CombinedOperands.push_back(buildGraph(Operands));
+  }
+
+  unsigned Opcode;
+  switch (ValuesOpcode) {
+  case Instruction::Load:
+    Opcode = VPInstruction::SLPLoad;
+    break;
+  case Instruction::Store:
+    Opcode = VPInstruction::SLPStore;
+    break;
+  default:
+    Opcode = ValuesOpcode;
+    break;
+  }
+
+  if (!CompletelySLP)
+    return markFailed();
+
+  assert(CombinedOperands.size() > 0 && "Need some operands");
+  auto *VPI = new VPInstruction(Opcode, CombinedOperands);
+  VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
+
+  LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
+             cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
+  addCombined(Values, VPI);
+  return VPI;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 972f0fa..b473579 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -106,6 +106,20 @@ public:
   const_user_range users() const {
     return const_user_range(user_begin(), user_end());
   }
+
+  /// Returns true if the value has more than one unique user.
+  bool hasMoreThanOneUniqueUser() {
+    if (getNumUsers() == 0)
+      return false;
+
+    // Check if all users match the first user.
+    auto Current = std::next(user_begin());
+    while (Current != user_end() && *user_begin() == *Current)
+      Current++;
+    return Current != user_end();
+  }
+
+  void replaceAllUsesWith(VPValue *New);
 };

 typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
@@ -151,6 +165,8 @@ public:
     return Operands[N];
   }

+  void setOperand(unsigned I, VPValue *New) { Operands[I] = New; }
+
   typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
   typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
   typedef iterator_range<operand_iterator> operand_range;
diff --git a/llvm/unittests/Transforms/Vectorize/CMakeLists.txt b/llvm/unittests/Transforms/Vectorize/CMakeLists.txt
index 5a9142d..6e886bb 100644
--- a/llvm/unittests/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/unittests/Transforms/Vectorize/CMakeLists.txt
@@ -10,4 +10,5 @@ add_llvm_unittest(VectorizeTests
   VPlanLoopInfoTest.cpp
   VPlanTest.cpp
   VPlanHCFGTest.cpp
+  VPlanSlpTest.cpp
   )
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
new file mode 100644
index 0000000..0381925
--- /dev/null
+++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
@@ -0,0 +1,899 @@
+//===- llvm/unittest/Transforms/Vectorize/VPlanSlpTest.cpp ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "../lib/Transforms/Vectorize/VPlan.h" +#include "../lib/Transforms/Vectorize/VPlanHCFGBuilder.h" +#include "../lib/Transforms/Vectorize/VPlanHCFGTransforms.h" +#include "VPlanTestBase.h" +#include "llvm/Analysis/VectorUtils.h" +#include "gtest/gtest.h" + +namespace llvm { +namespace { + +class VPlanSlpTest : public VPlanTestBase { +protected: + TargetLibraryInfoImpl TLII; + TargetLibraryInfo TLI; + DataLayout DL; + + std::unique_ptr AC; + std::unique_ptr SE; + std::unique_ptr AARes; + std::unique_ptr BasicAA; + std::unique_ptr LAI; + std::unique_ptr PSE; + std::unique_ptr IAI; + + VPlanSlpTest() + : TLII(), TLI(TLII), + DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-" + "f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:" + "16:32:64-S128") {} + + VPInterleavedAccessInfo getInterleavedAccessInfo(Function &F, Loop *L, + VPlan &Plan) { + AC.reset(new AssumptionCache(F)); + SE.reset(new ScalarEvolution(F, TLI, *AC, *DT, *LI)); + BasicAA.reset(new BasicAAResult(DL, F, TLI, *AC, &*DT, &*LI)); + AARes.reset(new AAResults(TLI)); + AARes->addAAResult(*BasicAA); + PSE.reset(new PredicatedScalarEvolution(*SE, *L)); + LAI.reset(new LoopAccessInfo(L, &*SE, &TLI, &*AARes, &*DT, &*LI)); + IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI)); + IAI->analyzeInterleaving(false); + return {Plan, *IAI}; + } +}; + +TEST_F(VPlanSlpTest, testSlpSimple_2) { + const char *ModuleString = + "%struct.Test = type { i32, i32 }\n" + "%struct.Test3 = type { i32, i32, i32 }\n" + "%struct.Test4xi8 = type { i8, i8, i8 }\n" + "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " + "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "entry:\n" + " br label %for.body\n" + "for.body: ; preds = %for.body, " + "%entry\n" + " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" + " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 0\n" + " %vA0 = load i32, i32* %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 0\n" + " %vB0 = load i32, i32* %B0, align 4\n" + " %add0 = add nsw i32 %vA0, %vB0\n" + " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 1\n" + " %vA1 = load i32, i32* %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 1\n" + " %vB1 = load i32, i32* %B1, align 4\n" + " %add1 = add nsw i32 %vA1, %vB1\n" + " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 0\n" + " store i32 %add0, i32* %C0, align 4\n" + " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 1\n" + " store i32 %add1, i32* %C1, align 4\n" + " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" + " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" + " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" + "for.cond.cleanup: ; preds = %for.body\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("add_x2"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildHCFG(LoopHeader); + auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan); + + VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); + EXPECT_NE(nullptr, Entry->getSingleSuccessor()); + VPBasicBlock *Body = 
+      Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::SLPStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  auto *CombinedLoadB = cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadA->getOpcode());
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadB->getOpcode());
+}
+
+TEST_F(VPlanSlpTest, testSlpSimple_3) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr %struct.Test, %struct.Test* %A, i64 "
+      " %indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      " %indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      " %indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      " %indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      " %indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      " %indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::SLPStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  auto *CombinedLoadB = cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad,
+            CombinedLoadA->getOpcode());
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadB->getOpcode());
+
+  VPInstruction *GetA = cast<VPInstruction>(&*std::next(Body->begin(), 1));
+  VPInstruction *GetB = cast<VPInstruction>(&*std::next(Body->begin(), 3));
+  EXPECT_EQ(GetA, CombinedLoadA->getOperand(0));
+  EXPECT_EQ(GetB, CombinedLoadB->getOperand(0));
+}
+
+TEST_F(VPlanSlpTest, testSlpReuse_1) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vA0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vA1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 8));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 10));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::SLPStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  EXPECT_EQ(CombinedLoadA, CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadA->getOpcode());
+}
+
+TEST_F(VPlanSlpTest, testSlpReuse_2) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define i32 @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vA0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vA1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret i32 %vA1\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 5));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 10));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  Slp.buildGraph(StoreRoot);
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+static void checkReorderExample(VPInstruction *Store1, VPInstruction *Store2,
+                                VPBasicBlock *Body,
+                                VPInterleavedAccessInfo &&IAI) {
+  VPlanSlp Slp(IAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+
+  EXPECT_TRUE(Slp.isCompletelySLP());
+  EXPECT_EQ(CombinedStore->getOpcode(), VPInstruction::SLPStore);
+
+  VPInstruction *CombinedAdd =
+      cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(CombinedAdd->getOpcode(), Instruction::Add);
+
+  VPInstruction *CombinedMulAB =
+      cast<VPInstruction>(CombinedAdd->getOperand(0));
+  VPInstruction *CombinedMulCD =
+      cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(CombinedMulAB->getOpcode(), Instruction::Mul);
+
+  VPInstruction *CombinedLoadA =
+      cast<VPInstruction>(CombinedMulAB->getOperand(0));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadA->getOpcode());
+  VPInstruction *LoadvA0 = cast<VPInstruction>(&*std::next(Body->begin(), 2));
+  VPInstruction *LoadvA1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  EXPECT_EQ(LoadvA0->getOperand(0), CombinedLoadA->getOperand(0));
+  EXPECT_EQ(LoadvA1->getOperand(0), CombinedLoadA->getOperand(1));
+
+  VPInstruction *CombinedLoadB =
+      cast<VPInstruction>(CombinedMulAB->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadB->getOpcode());
+  VPInstruction *LoadvB0 = cast<VPInstruction>(&*std::next(Body->begin(), 4));
+  VPInstruction *LoadvB1 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+  EXPECT_EQ(LoadvB0->getOperand(0), CombinedLoadB->getOperand(0));
+  EXPECT_EQ(LoadvB1->getOperand(0), CombinedLoadB->getOperand(1));
+
+  EXPECT_EQ(CombinedMulCD->getOpcode(), Instruction::Mul);
+
+  VPInstruction *CombinedLoadC =
+      cast<VPInstruction>(CombinedMulCD->getOperand(0));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadC->getOpcode());
+  VPInstruction *LoadvC0 = cast<VPInstruction>(&*std::next(Body->begin(), 7));
+  VPInstruction *LoadvC1 = cast<VPInstruction>(&*std::next(Body->begin(), 17));
+  EXPECT_EQ(LoadvC0->getOperand(0), CombinedLoadC->getOperand(0));
+  EXPECT_EQ(LoadvC1->getOperand(0), CombinedLoadC->getOperand(1));
+
+  VPInstruction *CombinedLoadD =
+      cast<VPInstruction>(CombinedMulCD->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadD->getOpcode());
+  VPInstruction *LoadvD0 = cast<VPInstruction>(&*std::next(Body->begin(), 9));
+  VPInstruction *LoadvD1 = cast<VPInstruction>(&*std::next(Body->begin(), 19));
+  EXPECT_EQ(LoadvD0->getOperand(0), CombinedLoadD->getOperand(0));
+  EXPECT_EQ(LoadvD1->getOperand(0), CombinedLoadD->getOperand(1));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_1) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vA1, %vB1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vC1, %vD1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_2) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vB1, %vA1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vD1, %vC1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_3) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA1, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vB1, %vA0\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vD1, %vC1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  EXPECT_EQ(nullptr, Slp.buildGraph(StoreRoot));
+
+  // FIXME: need to select a better first value for lane 0.
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_4) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vA1, %vB1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vC1, %vD1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+// Make sure we do not combine instructions with operands in different BBs.
+TEST_F(VPlanSlpTest, testInstrsInDifferentBBs) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  br label %bb2\n"
+      "bb2:\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+  VPBasicBlock *BB2 = Body->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(BB2->begin(), 3));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(BB2->begin(), 5));
+
+  VPlanSlp Slp(VPIAI, *BB2);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  EXPECT_EQ(nullptr, Slp.buildGraph(StoreRoot));
+  EXPECT_EQ(0u, Slp.getWidestBundleBits());
+}
+
+// Make sure we do not combine instructions with operands in different BBs.
+TEST_F(VPlanSlpTest, testInstrsInDifferentBBs2) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  br label %bb2\n"
+      "bb2:\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+  VPBasicBlock *BB2 = Body->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(BB2->begin(), 1));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(BB2->begin(), 3));
+
+  VPlanSlp Slp(VPIAI, *BB2);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  EXPECT_EQ(nullptr, Slp.buildGraph(StoreRoot));
+  EXPECT_EQ(0u, Slp.getWidestBundleBits());
+}
+
+TEST_F(VPlanSlpTest, testSlpAtomicLoad) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load atomic i32, i32* %A0 monotonic, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  EXPECT_EQ(nullptr, Slp.buildGraph(StoreRoot));
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+TEST_F(VPlanSlpTest, testSlpAtomicStore) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store atomic i32 %add0, i32* %C0 monotonic, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  Slp.buildGraph(StoreRoot);
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+} // namespace
+} // namespace llvm
-- 
2.7.4