From 09e516c54b0c5ec3fe6b3ef6c70d9e09e89abc95 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 14 Nov 2018 13:11:49 +0000
Subject: [PATCH] [VPlan, SLP] Add simple SLP analysis on top of VPlan.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

This patch adds an initial implementation of the look-ahead SLP tree
construction described in 'Look-Ahead SLP: Auto-vectorization in the
Presence of Commutative Operations, CGO 2018 by Vasileios Porpodas,
Rodrigo C. O. Rocha, Luís F. W. Góes'.

It returns an SLP tree represented as VPInstructions, with combined
instructions represented as a single, wider VPInstruction.

This initial version does not support instructions with multiple
different users (either inside or outside the SLP tree) or
non-instruction operands; it won't generate any shuffles or
insertelement instructions.

It also only adds the analysis that builds an SLP tree rooted in a set
of stores. It does not include any cost modeling or memory legality
checks. The plan is to integrate it with VPlan based cost modeling once
available, and to only apply it to operations that can be widened.

A follow-up patch will add support for replacing instructions in a
VPlan with their SLP counterparts.

Reviewers: Ayal, mssimpso, rengolin, mkuper, hfinkel, hsaito, dcaballe,
vporpo, RKSimon, ABataev

Reviewed By: rengolin

Differential Revision: https://reviews.llvm.org/D4949

llvm-svn: 346857
---
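Notes (not part of the commit message): a minimal sketch of how a client is
expected to drive the analysis, mirroring the unit tests added below. The
Entry/VPIAI setup and the store positions (12 and 14) assume the loop body
from testSlpSimple_2 in VPlanSlpTest.cpp; nothing here is API beyond what
the patch itself adds.

    // Locate the two adjacent stores in the single loop-body VPBasicBlock.
    VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
    VPInstruction *Store1 =
        cast<VPInstruction>(&*std::next(Body->begin(), 12));
    VPInstruction *Store2 =
        cast<VPInstruction>(&*std::next(Body->begin(), 14));

    // Seed the analysis with the store bundle; buildGraph combines operand
    // bundles bottom-up and returns a combined SLPStore root, or nullptr if
    // some bundle could not be combined.
    VPlanSlp Slp(VPIAI, *Body);
    SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
    if (VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot)) {
      assert(Slp.isCompletelySLP() && "all visited bundles were combined");
      // Two i32 lanes per bundle make the widest combined bundle 64 bits.
      unsigned Bits = Slp.getWidestBundleBits();
      (void)Bits;
    }
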
 llvm/lib/Transforms/Vectorize/CMakeLists.txt       |   1 +
 llvm/lib/Transforms/Vectorize/VPlan.cpp            |  13 +
 llvm/lib/Transforms/Vectorize/VPlan.h              | 125 ++-
 llvm/lib/Transforms/Vectorize/VPlanSLP.cpp         | 469 +++++++++++
 llvm/lib/Transforms/Vectorize/VPlanValue.h         |  16 +
 llvm/unittests/Transforms/Vectorize/CMakeLists.txt |   1 +
 .../Transforms/Vectorize/VPlanSlpTest.cpp          | 899 +++++++++++++++++++++
 7 files changed, 1523 insertions(+), 1 deletion(-)
 create mode 100644 llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
 create mode 100644 llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp

diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 27a4d24..06eaadf 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -7,6 +7,7 @@ add_llvm_library(LLVMVectorize
   VPlan.cpp
   VPlanHCFGBuilder.cpp
   VPlanHCFGTransforms.cpp
+  VPlanSLP.cpp
   VPlanVerifier.cpp

   ADDITIONAL_HEADER_DIRS
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 541378de..dad733b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -338,6 +338,12 @@ void VPInstruction::print(raw_ostream &O) const {
   case VPInstruction::ICmpULE:
     O << "icmp ule";
     break;
+  case VPInstruction::SLPLoad:
+    O << "combined load";
+    break;
+  case VPInstruction::SLPStore:
+    O << "combined store";
+    break;
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
@@ -681,6 +687,13 @@ void VPWidenMemoryInstructionRecipe::print(raw_ostream &O,

 template void DomTreeBuilder<VPDominatorTree>::Calculate(VPDominatorTree &DT);

+void VPValue::replaceAllUsesWith(VPValue *New) {
+  for (VPUser *User : users())
+    for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
+      if (User->getOperand(I) == this)
+        User->setOperand(I, New);
+}
+
 void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
                                           Old2NewTy &Old2New,
                                           InterleavedAccessInfo &IAI) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 6633322..dd50db3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -60,6 +60,7 @@ class Value;
 class VPBasicBlock;
 class VPRegionBlock;
 class VPlan;
+class VPlanSlp;

 /// A range of powers-of-2 vectorization factors with fixed start and
 /// adjustable end. The range includes start and excludes end, e.g.,:
@@ -609,10 +610,16 @@ public:
 /// the VPInstruction is also a single def-use vertex.
 class VPInstruction : public VPUser, public VPRecipeBase {
   friend class VPlanHCFGTransforms;
+  friend class VPlanSlp;

 public:
   /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
-  enum { Not = Instruction::OtherOpsEnd + 1, ICmpULE };
+  enum {
+    Not = Instruction::OtherOpsEnd + 1,
+    ICmpULE,
+    SLPLoad,
+    SLPStore,
+  };

 private:
   typedef unsigned char OpcodeTy;
@@ -622,6 +629,13 @@ private:
   /// modeled instruction.
   void generateInstruction(VPTransformState &State, unsigned Part);

+protected:
+  Instruction *getUnderlyingInstr() {
+    return cast_or_null<Instruction>(getUnderlyingValue());
+  }
+
+  void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
+
 public:
   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
       : VPUser(VPValue::VPInstructionSC, Operands),
@@ -635,6 +649,11 @@ public:
     return V->getVPValueID() == VPValue::VPInstructionSC;
   }

+  VPInstruction *clone() const {
+    SmallVector<VPValue *, 2> Operands(operands());
+    return new VPInstruction(Opcode, Operands);
+  }
+
   /// Method to support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const VPRecipeBase *R) {
     return R->getVPRecipeID() == VPRecipeBase::VPInstructionSC;
@@ -652,6 +671,14 @@ public:

   /// Print the VPInstruction.
   void print(raw_ostream &O) const;
+
+  /// Return true if this instruction may modify memory.
+  bool mayWriteToMemory() const {
+    // TODO: we can use attributes of the called function to rule out memory
+    //       modifications.
+    return Opcode == Instruction::Store || Opcode == Instruction::Call ||
+           Opcode == Instruction::Invoke || Opcode == SLPStore;
+  }
 };

 /// VPWidenRecipe is a recipe for producing a copy of vector type for each
@@ -1508,6 +1535,102 @@ public:
   }
 };

+/// Class that maps (parts of) an existing VPlan to trees of combined
+/// VPInstructions.
+class VPlanSlp {
+private:
+  enum class OpMode { Failed, Load, Opcode };
+
+  /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
+  /// DenseMap keys.
+  struct BundleDenseMapInfo {
+    static SmallVector<VPValue *, 4> getEmptyKey() {
+      return {reinterpret_cast<VPValue *>(-1)};
+    }
+
+    static SmallVector<VPValue *, 4> getTombstoneKey() {
+      return {reinterpret_cast<VPValue *>(-2)};
+    }
+
+    static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
+      return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+    }
+
+    static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
+                        const SmallVector<VPValue *, 4> &RHS) {
+      return LHS == RHS;
+    }
+  };
+
+  /// Mapping of values in the original VPlan to a combined VPInstruction.
+  DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
+      BundleToCombined;
+
+  VPInterleavedAccessInfo &IAI;
+
+  /// Basic block to operate on. For now, only instructions in a single BB are
+  /// considered.
+  const VPBasicBlock &BB;
+
+  /// Indicates whether we managed to combine all visited instructions or not.
+  bool CompletelySLP = true;
+
+  /// Width of the widest combined bundle in bits.
+  unsigned WidestBundleBits = 0;
+
+  using MultiNodeOpTy =
+      typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
+
+  // Input operand bundles for the current multi node. Each multi node operand
+  // bundle contains values not matching the multi node's opcode. They will
+  // be reordered in reorderMultiNodeOps, once we completed building a
+  // multi node.
+  SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
+
+  /// Indicates whether we are building a multi node currently.
+  bool MultiNodeActive = false;
+
+  /// Check if we can vectorize Operands together.
+  bool areVectorizable(ArrayRef<VPValue *> Operands) const;
+
+  /// Add combined instruction \p New for the bundle \p Operands.
+  void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);
+
+  /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
+  VPInstruction *markFailed();
+
+  /// Reorder operands in the multi node to maximize sequential memory access
+  /// and commutative operations.
+  SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
+
+  /// Choose the best candidate to use for the lane after \p Last. The set of
+  /// candidates to choose from are values with an opcode matching \p Last's
+  /// or loads consecutive to \p Last.
+  std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
+                                       SmallVectorImpl<VPValue *> &Candidates,
+                                       VPInterleavedAccessInfo &IAI);
+
+  /// Print bundle \p Values to dbgs().
+  void dumpBundle(ArrayRef<VPValue *> Values);
+
+public:
+  VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
+
+  ~VPlanSlp() {
+    for (auto &KV : BundleToCombined)
+      delete KV.second;
+  }
+
+  /// Tries to build an SLP tree rooted at \p Operands and returns a
+  /// VPInstruction combining \p Operands, if they can be combined.
+  VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);
+
+  /// Return the width of the widest combined bundle in bits.
+  unsigned getWidestBundleBits() const { return WidestBundleBits; }
+
+  /// Return true if all visited instructions can be combined.
+  bool isCompletelySLP() const { return CompletelySLP; }
+};
 } // end namespace llvm

 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
new file mode 100644
index 0000000..679fb51
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -0,0 +1,469 @@
+//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// This file implements SLP analysis based on VPlan. The analysis is based on
+/// the ideas described in
+///
+///   Look-ahead SLP: auto-vectorization in the presence of commutative
+///   operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+///   Luís F. W. Góes
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan-slp"
+
+// Number of levels to look ahead when re-ordering multi node operands.
+static unsigned LookaheadMaxDepth = 5;
+
+VPInstruction *VPlanSlp::markFailed() {
+  // FIXME: Currently this is used to signal we hit instructions we cannot
+  //        trivially SLP'ize.
+  CompletelySLP = false;
+  return nullptr;
+}
+
+void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
+  if (all_of(Operands, [](VPValue *V) {
+        return cast<VPInstruction>(V)->getUnderlyingInstr();
+      })) {
+    unsigned BundleSize = 0;
+    for (VPValue *V : Operands) {
+      Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
+      assert(!T->isVectorTy() && "Only scalar types supported for now");
+      BundleSize += T->getScalarSizeInBits();
+    }
+    WidestBundleBits = std::max(WidestBundleBits, BundleSize);
+  }
+
+  auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
+  assert(Res.second &&
+         "Already created a combined instruction for the operand bundle");
+  (void)Res;
+}
+
+bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
+  // Currently we only support VPInstructions.
+  if (!all_of(Operands, [](VPValue *Op) {
+        return Op && isa<VPInstruction>(Op) &&
+               cast<VPInstruction>(Op)->getUnderlyingInstr();
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
+    return false;
+  }
+
+  // Check if opcodes and type width agree for all instructions in the bundle.
+  // FIXME: Differing widths/opcodes can be handled by inserting additional
+  //        instructions.
+  // FIXME: Deal with non-primitive types.
+  const Instruction *OriginalInstr =
+      cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
+  unsigned Opcode = OriginalInstr->getOpcode();
+  unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
+  if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
+        const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
+        return I->getOpcode() == Opcode &&
+               I->getType()->getPrimitiveSizeInBits() == Width;
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree\n");
+    return false;
+  }
+
+  // For now, all operands must be defined in the same BB.
+  if (any_of(Operands, [this](VPValue *Op) {
+        return cast<VPInstruction>(Op)->getParent() != &this->BB;
+      })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
+    return false;
+  }
+
+  if (any_of(Operands,
+             [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
+    LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
+    return false;
+  }
+
+  // For loads, check that there are no instructions writing to memory in
+  // between them.
+  // TODO: we only have to forbid instructions writing to memory that could
+  //       interfere with any of the loads in the bundle.
+  if (Opcode == Instruction::Load) {
+    unsigned LoadsSeen = 0;
+    VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
+    for (auto &I : *Parent) {
+      auto *VPI = cast<VPInstruction>(&I);
+      if (VPI->getOpcode() == Instruction::Load &&
+          std::find(Operands.begin(), Operands.end(), VPI) != Operands.end())
+        LoadsSeen++;
+
+      if (LoadsSeen == Operands.size())
+        break;
+      if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
+        LLVM_DEBUG(
+            dbgs() << "VPSLP: instruction modifying memory between loads\n");
+        return false;
+      }
+    }
+
+    if (!all_of(Operands, [](VPValue *Op) {
+          return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
+              ->isSimple();
+        })) {
+      LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
+      return false;
+    }
+  }
+
+  if (Opcode == Instruction::Store)
+    if (!all_of(Operands, [](VPValue *Op) {
+          return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
+              ->isSimple();
+        })) {
+      LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
+      return false;
+    }
+
+  return true;
+}
+
+static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
+                                             unsigned OperandIndex) {
+  SmallVector<VPValue *, 4> Operands;
+  for (VPValue *V : Values) {
+    auto *U = cast<VPUser>(V);
+    Operands.push_back(U->getOperand(OperandIndex));
+  }
+  return Operands;
+}
+
+static bool areCommutative(ArrayRef<VPValue *> Values) {
+  return Instruction::isCommutative(
+      cast<VPInstruction>(Values[0])->getOpcode());
+}
+
+static SmallVector<SmallVector<VPValue *, 4>, 4>
+getOperands(ArrayRef<VPValue *> Values) {
+  SmallVector<SmallVector<VPValue *, 4>, 4> Result;
+  auto *VPI = cast<VPInstruction>(Values[0]);
+
+  switch (VPI->getOpcode()) {
+  case Instruction::Load:
+    llvm_unreachable("Loads terminate a tree, no need to get operands");
+  case Instruction::Store:
+    Result.push_back(getOperands(Values, 0));
+    break;
+  default:
+    for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
+      Result.push_back(getOperands(Values, I));
+    break;
+  }
+
+  return Result;
+}
+
+/// Returns the opcode of Values or None if they do not all agree.
+static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
+  unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
+  if (any_of(Values, [Opcode](VPValue *V) {
+        return cast<VPInstruction>(V)->getOpcode() != Opcode;
+      }))
+    return None;
+  return {Opcode};
+}
+
+/// Returns true if A and B access sequential memory if they are loads or
+/// stores, or if they have identical opcodes otherwise.
+static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
+                                  VPInterleavedAccessInfo &IAI) {
+  if (A->getOpcode() != B->getOpcode())
+    return false;
+
+  if (A->getOpcode() != Instruction::Load &&
+      A->getOpcode() != Instruction::Store)
+    return true;
+  auto *GA = IAI.getInterleaveGroup(A);
+  auto *GB = IAI.getInterleaveGroup(B);
+
+  return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
+}
+
+/// Implements getLAScore from Listing 7 in the paper.
+/// Traverses and compares operands of V1 and V2 to MaxLevel.
+static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
+                           VPInterleavedAccessInfo &IAI) {
+  if (!isa<VPInstruction>(V1) || !isa<VPInstruction>(V2))
+    return 0;
+
+  if (MaxLevel == 0)
+    return (unsigned)areConsecutiveOrMatch(cast<VPInstruction>(V1),
+                                           cast<VPInstruction>(V2), IAI);
+
+  unsigned Score = 0;
+  for (unsigned I = 0, EV1 = cast<VPUser>(V1)->getNumOperands(); I < EV1; ++I)
+    for (unsigned J = 0, EV2 = cast<VPUser>(V2)->getNumOperands(); J < EV2; ++J)
+      Score += getLAScore(cast<VPUser>(V1)->getOperand(I),
+                          cast<VPUser>(V2)->getOperand(J), MaxLevel - 1, IAI);
+  return Score;
+}
+
+std::pair<VPlanSlp::OpMode, VPValue *>
+VPlanSlp::getBest(OpMode Mode, VPValue *Last,
+                  SmallVectorImpl<VPValue *> &Candidates,
+                  VPInterleavedAccessInfo &IAI) {
+  LLVM_DEBUG(dbgs() << "    getBest\n");
+  VPValue *Best = Candidates[0];
+  SmallVector<VPValue *, 4> BestCandidates;
+
+  LLVM_DEBUG(dbgs() << "    Candidates for "
+                    << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
+  for (auto *Candidate : Candidates) {
+    auto *LastI = cast<VPInstruction>(Last);
+    auto *CandidateI = cast<VPInstruction>(Candidate);
+    if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
+      LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
+                        << " ");
+      BestCandidates.push_back(Candidate);
+    }
+  }
+  LLVM_DEBUG(dbgs() << "\n");
+
+  if (BestCandidates.empty())
+    return {OpMode::Failed, nullptr};
+
+  if (BestCandidates.size() == 1)
+    return {Mode, BestCandidates[0]};
+
+  if (Mode == OpMode::Opcode) {
+    unsigned BestScore = 0;
+    for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
+      unsigned PrevScore = ~0u;
+      bool AllSame = true;
+
+      // FIXME: Avoid visiting the same operands multiple times.
+      for (auto *Candidate : BestCandidates) {
+        unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
+        if (PrevScore == ~0u)
+          PrevScore = Score;
+        if (PrevScore != Score)
+          AllSame = false;
+        PrevScore = Score;
+
+        if (Score > BestScore) {
+          BestScore = Score;
+          Best = Candidate;
+        }
+      }
+      if (!AllSame)
+        break;
+    }
+  }
+  LLVM_DEBUG(dbgs() << "Found best "
+                    << *cast<VPInstruction>(Best)->getUnderlyingInstr()
+                    << "\n");
+  Candidates.erase(std::remove(Candidates.begin(), Candidates.end(), Best),
+                   Candidates.end());
+
+  return {Mode, Best};
+}
+
+SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
+  SmallVector<MultiNodeOpTy, 4> FinalOrder;
+  SmallVector<OpMode, 4> Mode;
+  FinalOrder.reserve(MultiNodeOps.size());
+  Mode.reserve(MultiNodeOps.size());
+
+  LLVM_DEBUG(dbgs() << "Reordering multinode\n");
+
+  for (auto &Operands : MultiNodeOps) {
+    FinalOrder.push_back({Operands.first, {Operands.second[0]}});
+    if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
+        Instruction::Load)
+      Mode.push_back(OpMode::Load);
+    else
+      Mode.push_back(OpMode::Opcode);
+  }
+
+  for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
+    LLVM_DEBUG(dbgs() << "  Finding best value for lane " << Lane << "\n");
+    SmallVector<VPValue *, 4> Candidates;
+    Candidates.reserve(MultiNodeOps.size());
+    LLVM_DEBUG(dbgs() << "  Candidates ");
+    for (auto Ops : MultiNodeOps) {
+      LLVM_DEBUG(
+          dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
+                 << " ");
+      Candidates.push_back(Ops.second[Lane]);
+    }
+    LLVM_DEBUG(dbgs() << "\n");
+
+    for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
+      LLVM_DEBUG(dbgs() << "  Checking " << Op << "\n");
+      if (Mode[Op] == OpMode::Failed)
+        continue;
+
+      VPValue *Last = FinalOrder[Op].second[Lane - 1];
+      std::pair<OpMode, VPValue *> Res =
+          getBest(Mode[Op], Last, Candidates, IAI);
+      if (Res.second)
+        FinalOrder[Op].second.push_back(Res.second);
+      else
+        // TODO: handle this case
+        FinalOrder[Op].second.push_back(markFailed());
+    }
+  }
+
+  return FinalOrder;
+}
+
+void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
+  LLVM_DEBUG(dbgs() << " Ops: ");
+  for (auto Op : Values)
+    if (auto *Instr =
+            cast_or_null<VPInstruction>(Op)->getUnderlyingInstr())
+      LLVM_DEBUG(dbgs() << *Instr << " | ");
+    else
+      LLVM_DEBUG(dbgs() << " nullptr | ");
+  LLVM_DEBUG(dbgs() << "\n");
+}
+
+VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
+  assert(!Values.empty() && "Need some operands!");
+
+  // If we already visited this instruction bundle, re-use the existing node.
+  auto I = BundleToCombined.find(to_vector<4>(Values));
+  if (I != BundleToCombined.end()) {
+#ifndef NDEBUG
+    // Check that the resulting graph is a tree. If we re-use a node, this
+    // means its values have multiple users. We only allow this, if all users
+    // of each value are the same instruction.
+    for (auto *V : Values) {
+      auto UI = V->user_begin();
+      auto *FirstUser = *UI++;
+      while (UI != V->user_end()) {
+        assert(*UI == FirstUser && "Currently we only support SLP trees.");
+        UI++;
+      }
+    }
+#endif
+    return I->second;
+  }
+
+  // Dump inputs
+  LLVM_DEBUG({
+    dbgs() << "buildGraph: ";
+    dumpBundle(Values);
+  });
+
+  if (!areVectorizable(Values))
+    return markFailed();
+
+  assert(getOpcode(Values) && "Opcodes for all values must match");
+  unsigned ValuesOpcode = getOpcode(Values).getValue();
+
+  SmallVector<VPValue *, 4> CombinedOperands;
+  if (areCommutative(Values)) {
+    bool MultiNodeRoot = !MultiNodeActive;
+    MultiNodeActive = true;
+    for (auto &Operands : getOperands(Values)) {
+      LLVM_DEBUG({
+        dbgs() << "  Visiting Commutative";
+        dumpBundle(Operands);
+      });
+
+      auto OperandsOpcode = getOpcode(Operands);
+      if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
+        LLVM_DEBUG(dbgs() << "    Same opcode, continue building\n");
+        CombinedOperands.push_back(buildGraph(Operands));
+      } else {
+        LLVM_DEBUG(dbgs() << "    Adding multinode Ops\n");
+        // Create a dummy VPInstruction, which we will replace later by the
+        // re-ordered operand.
+        VPInstruction *Op = new VPInstruction(0, {});
+        CombinedOperands.push_back(Op);
+        MultiNodeOps.emplace_back(Op, Operands);
+      }
+    }
+
+    if (MultiNodeRoot) {
+      LLVM_DEBUG(dbgs() << "Reorder\n");
+      MultiNodeActive = false;
+
+      auto FinalOrder = reorderMultiNodeOps();
+
+      MultiNodeOps.clear();
+      for (auto &Ops : FinalOrder) {
+        VPInstruction *NewOp = buildGraph(Ops.second);
+        Ops.first->replaceAllUsesWith(NewOp);
+        for (unsigned i = 0; i < CombinedOperands.size(); i++)
+          if (CombinedOperands[i] == Ops.first)
+            CombinedOperands[i] = NewOp;
+        delete Ops.first;
+        Ops.first = NewOp;
+      }
+      LLVM_DEBUG(dbgs() << "Found final order\n");
+    }
+  } else {
+    LLVM_DEBUG(dbgs() << "  Non-commutative\n");
+    if (ValuesOpcode == Instruction::Load)
+      for (VPValue *V : Values)
+        CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
+    else
+      for (auto &Operands : getOperands(Values))
+        CombinedOperands.push_back(buildGraph(Operands));
+  }
+
+  unsigned Opcode;
+  switch (ValuesOpcode) {
+  case Instruction::Load:
+    Opcode = VPInstruction::SLPLoad;
+    break;
+  case Instruction::Store:
+    Opcode = VPInstruction::SLPStore;
+    break;
+  default:
+    Opcode = ValuesOpcode;
+    break;
+  }
+
+  if (!CompletelySLP)
+    return markFailed();
+
+  assert(CombinedOperands.size() > 0 && "Need some operands");
+  auto *VPI = new VPInstruction(Opcode, CombinedOperands);
+  VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
+
+  LLVM_DEBUG(dbgs() << "Create VPInstruction "; VPI->print(dbgs());
+             cast<VPInstruction>(Values[0])->print(dbgs()); dbgs() << "\n");
+  addCombined(Values, VPI);
+  return VPI;
+}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 972f0fa..b473579 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -106,6 +106,20 @@ public:
   const_user_range users() const {
     return const_user_range(user_begin(), user_end());
   }
+
+  /// Returns true if the value has more than one unique user.
+  bool hasMoreThanOneUniqueUser() {
+    if (getNumUsers() == 0)
+      return false;
+
+    // Check if all users match the first user.
+    auto Current = std::next(user_begin());
+    while (Current != user_end() && *user_begin() == *Current)
+      Current++;
+    return Current != user_end();
+  }
+
+  void replaceAllUsesWith(VPValue *New);
 };

 typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
@@ -151,6 +165,8 @@ public:
     return Operands[N];
   }

+  void setOperand(unsigned I, VPValue *New) { Operands[I] = New; }
+
   typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
   typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
   typedef iterator_range<operand_iterator> operand_range;
diff --git a/llvm/unittests/Transforms/Vectorize/CMakeLists.txt b/llvm/unittests/Transforms/Vectorize/CMakeLists.txt
index 5a9142d..6e886bb 100644
--- a/llvm/unittests/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/unittests/Transforms/Vectorize/CMakeLists.txt
@@ -10,4 +10,5 @@ add_llvm_unittest(VectorizeTests
   VPlanLoopInfoTest.cpp
   VPlanTest.cpp
   VPlanHCFGTest.cpp
+  VPlanSlpTest.cpp
   )
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
new file mode 100644
index 0000000..0381925
--- /dev/null
+++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
@@ -0,0 +1,899 @@
+//===- llvm/unittest/Transforms/Vectorize/VPlanSlpTest.cpp ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// + +#include "../lib/Transforms/Vectorize/VPlan.h" +#include "../lib/Transforms/Vectorize/VPlanHCFGBuilder.h" +#include "../lib/Transforms/Vectorize/VPlanHCFGTransforms.h" +#include "VPlanTestBase.h" +#include "llvm/Analysis/VectorUtils.h" +#include "gtest/gtest.h" + +namespace llvm { +namespace { + +class VPlanSlpTest : public VPlanTestBase { +protected: + TargetLibraryInfoImpl TLII; + TargetLibraryInfo TLI; + DataLayout DL; + + std::unique_ptr AC; + std::unique_ptr SE; + std::unique_ptr AARes; + std::unique_ptr BasicAA; + std::unique_ptr LAI; + std::unique_ptr PSE; + std::unique_ptr IAI; + + VPlanSlpTest() + : TLII(), TLI(TLII), + DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-" + "f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:" + "16:32:64-S128") {} + + VPInterleavedAccessInfo getInterleavedAccessInfo(Function &F, Loop *L, + VPlan &Plan) { + AC.reset(new AssumptionCache(F)); + SE.reset(new ScalarEvolution(F, TLI, *AC, *DT, *LI)); + BasicAA.reset(new BasicAAResult(DL, F, TLI, *AC, &*DT, &*LI)); + AARes.reset(new AAResults(TLI)); + AARes->addAAResult(*BasicAA); + PSE.reset(new PredicatedScalarEvolution(*SE, *L)); + LAI.reset(new LoopAccessInfo(L, &*SE, &TLI, &*AARes, &*DT, &*LI)); + IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI)); + IAI->analyzeInterleaving(false); + return {Plan, *IAI}; + } +}; + +TEST_F(VPlanSlpTest, testSlpSimple_2) { + const char *ModuleString = + "%struct.Test = type { i32, i32 }\n" + "%struct.Test3 = type { i32, i32, i32 }\n" + "%struct.Test4xi8 = type { i8, i8, i8 }\n" + "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* " + "nocapture readonly %B, %struct.Test* nocapture %C) {\n" + "entry:\n" + " br label %for.body\n" + "for.body: ; preds = %for.body, " + "%entry\n" + " %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n" + " %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 0\n" + " %vA0 = load i32, i32* %A0, align 4\n" + " %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 0\n" + " %vB0 = load i32, i32* %B0, align 4\n" + " %add0 = add nsw i32 %vA0, %vB0\n" + " %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 " + "%indvars.iv, i32 1\n" + " %vA1 = load i32, i32* %A1, align 4\n" + " %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 " + "%indvars.iv, i32 1\n" + " %vB1 = load i32, i32* %B1, align 4\n" + " %add1 = add nsw i32 %vA1, %vB1\n" + " %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 0\n" + " store i32 %add0, i32* %C0, align 4\n" + " %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 " + "%indvars.iv, i32 1\n" + " store i32 %add1, i32* %C1, align 4\n" + " %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n" + " %exitcond = icmp eq i64 %indvars.iv.next, 1024\n" + " br i1 %exitcond, label %for.cond.cleanup, label %for.body\n" + "for.cond.cleanup: ; preds = %for.body\n" + " ret void\n" + "}\n"; + + Module &M = parseModule(ModuleString); + + Function *F = M.getFunction("add_x2"); + BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor(); + auto Plan = buildHCFG(LoopHeader); + auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan); + + VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); + EXPECT_NE(nullptr, Entry->getSingleSuccessor()); + VPBasicBlock *Body = 
+      Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::SLPStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  auto *CombinedLoadB = cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadA->getOpcode());
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadB->getOpcode());
+}
+
+TEST_F(VPlanSlpTest, testSlpSimple_3) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr %struct.Test, %struct.Test* %A, i64 "
+      " %indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      " %indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      " %indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      " %indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      " %indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      " %indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::SLPStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  auto *CombinedLoadB = cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad,
+            CombinedLoadA->getOpcode());
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadB->getOpcode());
+
+  VPInstruction *GetA = cast<VPInstruction>(&*std::next(Body->begin(), 1));
+  VPInstruction *GetB = cast<VPInstruction>(&*std::next(Body->begin(), 3));
+  EXPECT_EQ(GetA, CombinedLoadA->getOperand(0));
+  EXPECT_EQ(GetB, CombinedLoadB->getOperand(0));
+}
+
+TEST_F(VPlanSlpTest, testSlpReuse_1) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vA0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vA1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 8));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 10));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+  EXPECT_EQ(64u, Slp.getWidestBundleBits());
+  EXPECT_EQ(VPInstruction::SLPStore, CombinedStore->getOpcode());
+
+  auto *CombinedAdd = cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(Instruction::Add, CombinedAdd->getOpcode());
+
+  auto *CombinedLoadA = cast<VPInstruction>(CombinedAdd->getOperand(0));
+  EXPECT_EQ(CombinedLoadA, CombinedAdd->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadA->getOpcode());
+}
+
+TEST_F(VPlanSlpTest, testSlpReuse_2) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define i32 @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vA0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vA1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret i32 %vA1\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 5));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 10));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  Slp.buildGraph(StoreRoot);
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+static void checkReorderExample(VPInstruction *Store1, VPInstruction *Store2,
+                                VPBasicBlock *Body,
+                                VPInterleavedAccessInfo &&IAI) {
+  VPlanSlp Slp(IAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  VPInstruction *CombinedStore = Slp.buildGraph(StoreRoot);
+
+  EXPECT_TRUE(Slp.isCompletelySLP());
+  EXPECT_EQ(CombinedStore->getOpcode(), VPInstruction::SLPStore);
+
+  VPInstruction *CombinedAdd =
+      cast<VPInstruction>(CombinedStore->getOperand(0));
+  EXPECT_EQ(CombinedAdd->getOpcode(), Instruction::Add);
+
+  VPInstruction *CombinedMulAB =
+      cast<VPInstruction>(CombinedAdd->getOperand(0));
+  VPInstruction *CombinedMulCD =
+      cast<VPInstruction>(CombinedAdd->getOperand(1));
+  EXPECT_EQ(CombinedMulAB->getOpcode(), Instruction::Mul);
+
+  VPInstruction *CombinedLoadA =
+      cast<VPInstruction>(CombinedMulAB->getOperand(0));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadA->getOpcode());
+  VPInstruction *LoadvA0 = cast<VPInstruction>(&*std::next(Body->begin(), 2));
+  VPInstruction *LoadvA1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  EXPECT_EQ(LoadvA0->getOperand(0), CombinedLoadA->getOperand(0));
+  EXPECT_EQ(LoadvA1->getOperand(0), CombinedLoadA->getOperand(1));
+
+  VPInstruction *CombinedLoadB =
+      cast<VPInstruction>(CombinedMulAB->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadB->getOpcode());
+  VPInstruction *LoadvB0 = cast<VPInstruction>(&*std::next(Body->begin(), 4));
+  VPInstruction *LoadvB1 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+  EXPECT_EQ(LoadvB0->getOperand(0), CombinedLoadB->getOperand(0));
+  EXPECT_EQ(LoadvB1->getOperand(0), CombinedLoadB->getOperand(1));
+
+  EXPECT_EQ(CombinedMulCD->getOpcode(), Instruction::Mul);
+
+  VPInstruction *CombinedLoadC =
+      cast<VPInstruction>(CombinedMulCD->getOperand(0));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadC->getOpcode());
+  VPInstruction *LoadvC0 = cast<VPInstruction>(&*std::next(Body->begin(), 7));
+  VPInstruction *LoadvC1 = cast<VPInstruction>(&*std::next(Body->begin(), 17));
+  EXPECT_EQ(LoadvC0->getOperand(0), CombinedLoadC->getOperand(0));
+  EXPECT_EQ(LoadvC1->getOperand(0), CombinedLoadC->getOperand(1));
+
+  VPInstruction *CombinedLoadD =
+      cast<VPInstruction>(CombinedMulCD->getOperand(1));
+  EXPECT_EQ(VPInstruction::SLPLoad, CombinedLoadD->getOpcode());
+  VPInstruction *LoadvD0 = cast<VPInstruction>(&*std::next(Body->begin(), 9));
+  VPInstruction *LoadvD1 = cast<VPInstruction>(&*std::next(Body->begin(), 19));
+  EXPECT_EQ(LoadvD0->getOperand(0), CombinedLoadD->getOperand(0));
+  EXPECT_EQ(LoadvD1->getOperand(0), CombinedLoadD->getOperand(1));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_1) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vA1, %vB1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vC1, %vD1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_2) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vB1, %vA1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vD1, %vC1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_3) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA1, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vB1, %vA0\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vD1, %vC1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  EXPECT_EQ(nullptr, Slp.buildGraph(StoreRoot));
+
+  // FIXME: need to select a better first value for lane 0.
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+TEST_F(VPlanSlpTest, testSlpReorder_4) {
+  LLVMContext Ctx;
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "define void @add_x3(%struct.Test* %A, %struct.Test* %B, %struct.Test* "
+      "%C, %struct.Test* %D, %struct.Test* %E) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %mul11 = mul nsw i32 %vA0, %vB0\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vC0 = load i32, i32* %C0, align 4\n"
+      "  %D0 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vD0 = load i32, i32* %D0, align 4\n"
+      "  %mul12 = mul nsw i32 %vC0, %vD0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %mul21 = mul nsw i32 %vA1, %vB1\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vC1 = load i32, i32* %C1, align 4\n"
+      "  %D1 = getelementptr inbounds %struct.Test, %struct.Test* %D, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vD1 = load i32, i32* %D1, align 4\n"
+      "  %mul22 = mul nsw i32 %vC1, %vD1\n"
+      "  %add1 = add nsw i32 %mul11, %mul12\n"
+      "  %add2 = add nsw i32 %mul22, %mul21\n"
+      "  %E0 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add1, i32* %E0, align 4\n"
+      "  %E1 = getelementptr inbounds %struct.Test, %struct.Test* %E, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add2, i32* %E1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x3");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 24));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 26));
+
+  checkReorderExample(
+      Store1, Store2, Body,
+      getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan));
+}
+
+// Make sure we do not combine instructions with operands in different BBs.
+TEST_F(VPlanSlpTest, testInstrsInDifferentBBs) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  br label %bb2\n"
+      "bb2:\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+  VPBasicBlock *BB2 = Body->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(BB2->begin(), 3));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(BB2->begin(), 5));
+
+  VPlanSlp Slp(VPIAI, *BB2);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  EXPECT_EQ(nullptr, Slp.buildGraph(StoreRoot));
+  EXPECT_EQ(0u, Slp.getWidestBundleBits());
+}
+
+// Make sure we do not combine instructions with operands in different BBs.
+TEST_F(VPlanSlpTest, testInstrsInDifferentBBs2) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  br label %bb2\n"
+      "bb2:\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+  VPBasicBlock *BB2 = Body->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(BB2->begin(), 1));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(BB2->begin(), 3));
+
+  VPlanSlp Slp(VPIAI, *BB2);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  EXPECT_EQ(nullptr, Slp.buildGraph(StoreRoot));
+  EXPECT_EQ(0u, Slp.getWidestBundleBits());
+}
+
+TEST_F(VPlanSlpTest, testSlpAtomicLoad) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load atomic i32, i32* %A0 monotonic, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store i32 %add0, i32* %C0, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  EXPECT_EQ(nullptr, Slp.buildGraph(StoreRoot));
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+TEST_F(VPlanSlpTest, testSlpAtomicStore) {
+  const char *ModuleString =
+      "%struct.Test = type { i32, i32 }\n"
+      "%struct.Test3 = type { i32, i32, i32 }\n"
+      "%struct.Test4xi8 = type { i8, i8, i8 }\n"
+      "define void @add_x2(%struct.Test* nocapture readonly %A, %struct.Test* "
+      "nocapture readonly %B, %struct.Test* nocapture %C) {\n"
+      "entry:\n"
+      "  br label %for.body\n"
+      "for.body: ; preds = %for.body, "
+      "%entry\n"
+      "  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]\n"
+      "  %A0 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vA0 = load i32, i32* %A0, align 4\n"
+      "  %B0 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 0\n"
+      "  %vB0 = load i32, i32* %B0, align 4\n"
+      "  %add0 = add nsw i32 %vA0, %vB0\n"
+      "  %A1 = getelementptr inbounds %struct.Test, %struct.Test* %A, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vA1 = load i32, i32* %A1, align 4\n"
+      "  %B1 = getelementptr inbounds %struct.Test, %struct.Test* %B, i64 "
+      "%indvars.iv, i32 1\n"
+      "  %vB1 = load i32, i32* %B1, align 4\n"
+      "  %add1 = add nsw i32 %vA1, %vB1\n"
+      "  %C0 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 0\n"
+      "  store atomic i32 %add0, i32* %C0 monotonic, align 4\n"
+      "  %C1 = getelementptr inbounds %struct.Test, %struct.Test* %C, i64 "
+      "%indvars.iv, i32 1\n"
+      "  store i32 %add1, i32* %C1, align 4\n"
+      "  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1\n"
+      "  %exitcond = icmp eq i64 %indvars.iv.next, 1024\n"
+      "  br i1 %exitcond, label %for.cond.cleanup, label %for.body\n"
+      "for.cond.cleanup: ; preds = %for.body\n"
+      "  ret void\n"
+      "}\n";
+
+  Module &M = parseModule(ModuleString);
+
+  Function *F = M.getFunction("add_x2");
+  BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
+  auto Plan = buildHCFG(LoopHeader);
+  auto VPIAI = getInterleavedAccessInfo(*F, LI->getLoopFor(LoopHeader), *Plan);
+
+  VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock();
+  EXPECT_NE(nullptr, Entry->getSingleSuccessor());
+  VPBasicBlock *Body = Entry->getSingleSuccessor()->getEntryBasicBlock();
+
+  VPInstruction *Store1 = cast<VPInstruction>(&*std::next(Body->begin(), 12));
+  VPInstruction *Store2 = cast<VPInstruction>(&*std::next(Body->begin(), 14));
+
+  VPlanSlp Slp(VPIAI, *Body);
+  SmallVector<VPValue *, 4> StoreRoot = {Store1, Store2};
+  Slp.buildGraph(StoreRoot);
+  EXPECT_FALSE(Slp.isCompletelySLP());
+}
+
+} // namespace
+} // namespace llvm
-- 
2.7.4