From 81dbb6aec6261a06132f17b06c9071f4a1871fa4 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 11 Feb 2020 18:27:41 +0000
Subject: [PATCH] Recommit "[DSE] Add first version of MemorySSA-backed DSE
 (Bottom up walk)."

This includes a fix for the santizier failures.

This reverts the revert commit
42f8b915eb72364cc5e84adf58a2c2d4947e8b10.
---
 .../lib/Transforms/Scalar/DeadStoreElimination.cpp | 538 ++++++++++++++++++++-
 .../MSSA/2011-09-06-EndOfFunction.ll               |   1 +
 .../MSSA/OverwriteStoreBegin.ll                    |   1 +
 .../DeadStoreElimination/MSSA/OverwriteStoreEnd.ll |   1 +
 .../Transforms/DeadStoreElimination/MSSA/atomic.ll |   1 +
 .../DeadStoreElimination/MSSA/calloc-store.ll      |   2 +
 .../DeadStoreElimination/MSSA/fence-todo.ll        |  50 ++
 .../Transforms/DeadStoreElimination/MSSA/fence.ll  |  48 --
 .../Transforms/DeadStoreElimination/MSSA/free.ll   |   2 +
 .../DeadStoreElimination/MSSA/inst-limits.ll       |   9 +-
 .../DeadStoreElimination/MSSA/lifetime.ll          |   2 +
 .../MSSA/mda-with-dbg-values.ll                    |  20 +-
 .../MSSA/memcpy-complete-overwrite.ll              |   2 +
 .../DeadStoreElimination/MSSA/memintrinsics.ll     |   1 +
 .../MSSA/memoryssa-scan-limit.ll                   |  72 +++
 .../DeadStoreElimination/MSSA/memset-and-memcpy.ll |   9 +-
 .../MSSA/memset-missing-debugloc.ll                |   1 +
 .../MSSA/merge-stores-big-endian.ll                |   1 +
 .../DeadStoreElimination/MSSA/merge-stores.ll      |   1 +
 .../MSSA/multiblock-captures.ll                    |   7 +-
 .../MSSA/multiblock-exceptions.ll                  |   1 +
 .../DeadStoreElimination/MSSA/multiblock-loops.ll  | 114 ++++-
 .../MSSA/multiblock-memoryphis.ll                  |  70 +++
 .../MSSA/multiblock-partial.ll                     |   3 +-
 .../DeadStoreElimination/MSSA/multiblock-simple.ll |  41 +-
 .../DeadStoreElimination/MSSA/operand-bundles.ll   |   1 +
 .../DeadStoreElimination/MSSA/simple-todo.ll       | 159 +-----
 .../Transforms/DeadStoreElimination/MSSA/simple.ll | 167 ++++++-
 28 files changed, 1066 insertions(+), 259 deletions(-)
 create mode 100644 llvm/test/Transforms/DeadStoreElimination/MSSA/fence-todo.ll
 create mode 100644 llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index f56a4c5..7106fb0 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -29,7 +29,10 @@
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/OrderedBasicBlock.h"
+#include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
@@ -40,6 +43,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -87,11 +91,20 @@ EnablePartialStoreMerging("enable-dse-partial-store-merging",
   cl::init(true), cl::Hidden,
   cl::desc("Enable partial store merging in DSE"));
 
-// Temporary dummy option for tests.
 static cl::opt<bool>
     EnableMemorySSA("enable-dse-memoryssa", cl::init(false), cl::Hidden,
                     cl::desc("Use the new MemorySSA-backed DSE."));
 
+static cl::opt<unsigned>
+    MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(100), cl::Hidden,
+                       cl::desc("The number of memory instructions to scan for "
+                                "dead store elimination (default = 100)"));
+
+static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
+    "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
+    cl::desc("The number of MemoryDefs we consider as candidates to eliminated "
+             "other stores per basic block (default = 5000)"));
+
 //===----------------------------------------------------------------------===//
 // Helper functions
 //===----------------------------------------------------------------------===//
@@ -1354,22 +1367,490 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis *AA,
   return MadeChange;
 }
 
+namespace {
+//=============================================================================
+// MemorySSA backed dead store elimination.
+//
+// The code below implements dead store elimination using MemorySSA. It uses
+// the following general approach: given a MemoryDef, walk upwards to find
+// clobbering MemoryDefs that may be killed by the starting def. Then check
+// that there are no uses that may read the location of the original MemoryDef
+// in between both MemoryDefs. A bit more concretely:
+//
+// For all MemoryDefs StartDef:
+// 1. Get the next dominating clobbering MemoryDef (DomAccess) by walking
+//    upwards.
+// 2. Check that there are no reads between DomAccess and the StartDef by
+//    checking all uses starting at DomAccess and walking until we see StartDef.
+// 3. For each found DomDef, check that:
+//   1. There are no barrier instructions between DomDef and StartDef (like
+//       throws or stores with ordering constraints).
+//   2. StartDef is executed whenever DomDef is executed.
+//   3. StartDef completely overwrites DomDef.
+// 4. Erase DomDef from the function and MemorySSA.
+
+// Returns true if \p M is an intrisnic that does not read or write memory.
+bool isNoopIntrinsic(MemoryUseOrDef *M) {
+  if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(M->getMemoryInst())) {
+    switch (II->getIntrinsicID()) {
+    case Intrinsic::lifetime_start:
+    case Intrinsic::lifetime_end:
+    case Intrinsic::invariant_end:
+    case Intrinsic::launder_invariant_group:
+    case Intrinsic::assume:
+      return true;
+    case Intrinsic::dbg_addr:
+    case Intrinsic::dbg_declare:
+    case Intrinsic::dbg_label:
+    case Intrinsic::dbg_value:
+      llvm_unreachable("Intrinsic should not be modeled in MemorySSA");
+    default:
+      return false;
+    }
+  }
+  return false;
+}
+
+// Check if we can ignore \p D for DSE.
+bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
+  Instruction *DI = D->getMemoryInst();
+  // Calls that only access inaccessible memory cannot read or write any memory
+  // locations we consider for elimination.
+  if (auto CS = CallSite(DI))
+    if (CS.onlyAccessesInaccessibleMemory())
+      return true;
+
+  // We can eliminate stores to locations not visible to the caller across
+  // throwing instructions.
+  if (DI->mayThrow() && !DefVisibleToCaller)
+    return true;
+
+  // We can remove the dead stores, irrespective of the fence and its ordering
+  // (release/acquire/seq_cst). Fences only constraints the ordering of
+  // already visible stores, it does not make a store visible to other
+  // threads. So, skipping over a fence does not change a store from being
+  // dead.
+  if (isa<FenceInst>(DI))
+    return true;
+
+  // Skip intrinsics that do not really read or modify memory.
+  if (isNoopIntrinsic(D))
+    return true;
+
+  return false;
+}
+
+struct DSEState {
+  Function &F;
+  AliasAnalysis &AA;
+  MemorySSA &MSSA;
+  DominatorTree &DT;
+  PostDominatorTree &PDT;
+  const TargetLibraryInfo &TLI;
+
+  // All MemoryDefs that potentially could kill other MemDefs.
+  SmallVector<MemoryDef *, 64> MemDefs;
+  // Any that should be skipped as they are already deleted
+  SmallPtrSet<MemoryAccess *, 4> SkipStores;
+  // Keep track of all of the objects that are invisible to the caller until the
+  // function returns.
+  SmallPtrSet<const Value *, 16> InvisibleToCaller;
+  // Keep track of blocks with throwing instructions not modeled in MemorySSA.
+  SmallPtrSet<BasicBlock *, 16> ThrowingBlocks;
+
+  DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
+           PostDominatorTree &PDT, const TargetLibraryInfo &TLI)
+      : F(F), AA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI) {}
+
+  static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
+                      DominatorTree &DT, PostDominatorTree &PDT,
+                      const TargetLibraryInfo &TLI) {
+    DSEState State(F, AA, MSSA, DT, PDT, TLI);
+    // Collect blocks with throwing instructions not modeled in MemorySSA and
+    // alloc-like objects.
+    for (Instruction &I : instructions(F)) {
+      if (I.mayThrow() && !MSSA.getMemoryAccess(&I))
+        State.ThrowingBlocks.insert(I.getParent());
+
+      auto *MD = dyn_cast_or_null<MemoryDef>(MSSA.getMemoryAccess(&I));
+      if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit &&
+          hasAnalyzableMemoryWrite(&I, TLI) && isRemovable(&I))
+        State.MemDefs.push_back(MD);
+
+      // Track alloca and alloca-like objects. Here we care about objects not
+      // visible to the caller during function execution. Alloca objects are
+      // invalid in the caller, for alloca-like objects we ensure that they are
+      // not captured throughout the function.
+      if (isa<AllocaInst>(&I) ||
+          (isAllocLikeFn(&I, &TLI) && !PointerMayBeCaptured(&I, false, true)))
+        State.InvisibleToCaller.insert(&I);
+    }
+    // Treat byval or inalloca arguments the same as Allocas, stores to them are
+    // dead at the end of the function.
+    for (Argument &AI : F.args())
+      if (AI.hasByValOrInAllocaAttr())
+        State.InvisibleToCaller.insert(&AI);
+    return State;
+  }
+
+  Optional<MemoryLocation> getLocForWriteEx(Instruction *I) const {
+    if (!I->mayWriteToMemory())
+      return None;
+
+    if (auto *MTI = dyn_cast<AnyMemIntrinsic>(I))
+      return {MemoryLocation::getForDest(MTI)};
+
+    if (auto CS = CallSite(I)) {
+      if (Function *F = CS.getCalledFunction()) {
+        StringRef FnName = F->getName();
+        if (TLI.has(LibFunc_strcpy) && FnName == TLI.getName(LibFunc_strcpy))
+          return {MemoryLocation(CS.getArgument(0))};
+        if (TLI.has(LibFunc_strncpy) && FnName == TLI.getName(LibFunc_strncpy))
+          return {MemoryLocation(CS.getArgument(0))};
+        if (TLI.has(LibFunc_strcat) && FnName == TLI.getName(LibFunc_strcat))
+          return {MemoryLocation(CS.getArgument(0))};
+        if (TLI.has(LibFunc_strncat) && FnName == TLI.getName(LibFunc_strncat))
+          return {MemoryLocation(CS.getArgument(0))};
+      }
+      return None;
+    }
+
+    return MemoryLocation::getOrNone(I);
+  }
+
+  /// Returns true if \p Use completely overwrites \p DefLoc.
+  bool isCompleteOverwrite(MemoryLocation DefLoc, Instruction *UseInst) const {
+    // UseInst has a MemoryDef associated in MemorySSA. It's possible for a
+    // MemoryDef to not write to memory, e.g. a volatile load is modeled as a
+    // MemoryDef.
+    if (!UseInst->mayWriteToMemory())
+      return false;
+
+    if (auto CS = CallSite(UseInst))
+      if (CS.onlyAccessesInaccessibleMemory())
+        return false;
+
+    ModRefInfo MR = AA.getModRefInfo(UseInst, DefLoc);
+    // If necessary, perform additional analysis.
+    if (isModSet(MR))
+      MR = AA.callCapturesBefore(UseInst, DefLoc, &DT);
+
+    Optional<MemoryLocation> UseLoc = getLocForWriteEx(UseInst);
+    return isModSet(MR) && isMustSet(MR) &&
+           UseLoc->Size.getValue() >= DefLoc.Size.getValue();
+  }
+
+  /// Returns true if \p Use may read from \p DefLoc.
+  bool isReadClobber(MemoryLocation DefLoc, Instruction *UseInst) const {
+    if (!UseInst->mayReadFromMemory())
+      return false;
+
+    if (auto CS = CallSite(UseInst))
+      if (CS.onlyAccessesInaccessibleMemory())
+        return false;
+
+    ModRefInfo MR = AA.getModRefInfo(UseInst, DefLoc);
+    // If necessary, perform additional analysis.
+    if (isRefSet(MR))
+      MR = AA.callCapturesBefore(UseInst, DefLoc, &DT);
+    return isRefSet(MR);
+  }
+
+  // Find a MemoryDef writing to \p DefLoc and dominating \p Current, with no
+  // read access in between or return None otherwise. The returned value may not
+  // (completely) overwrite \p DefLoc. Currently we bail out when we encounter
+  // any of the following
+  //  * An aliasing MemoryUse (read).
+  //  * A MemoryPHI.
+  Optional<MemoryAccess *> getDomMemoryDef(MemoryDef *KillingDef,
+                                           MemoryAccess *Current,
+                                           MemoryLocation DefLoc,
+                                           bool DefVisibleToCaller,
+                                           int &ScanLimit) const {
+    MemoryDef *DomDef;
+    MemoryAccess *StartDef = Current;
+    bool StepAgain;
+    LLVM_DEBUG(dbgs() << "  trying to get dominating access for " << *Current
+                      << "\n");
+    // Find the next clobbering Mod access for DefLoc, starting at Current.
+    do {
+      StepAgain = false;
+      // Reached TOP.
+      if (MSSA.isLiveOnEntryDef(Current))
+        return None;
+
+      MemoryUseOrDef *CurrentUD = dyn_cast<MemoryUseOrDef>(Current);
+      if (!CurrentUD)
+        return None;
+
+      // Look for access that clobber DefLoc.
+      MemoryAccess *DomAccess =
+          MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(
+              CurrentUD->getDefiningAccess(), DefLoc);
+      DomDef = dyn_cast<MemoryDef>(DomAccess);
+      if (!DomDef || MSSA.isLiveOnEntryDef(DomDef))
+        return None;
+
+      // Check if we can skip DomDef for DSE. We also require the KillingDef
+      // execute whenever DomDef executes and use post-dominance to ensure that.
+      if (canSkipDef(DomDef, DefVisibleToCaller) ||
+          !PDT.dominates(KillingDef->getBlock(), DomDef->getBlock())) {
+        StepAgain = true;
+        Current = DomDef;
+      }
+
+    } while (StepAgain);
+
+    LLVM_DEBUG(dbgs() << "  Checking for reads of " << *DomDef << " ("
+                      << *DomDef->getMemoryInst() << ")\n");
+
+    SmallSetVector<MemoryAccess *, 32> WorkList;
+    auto PushMemUses = [&WorkList](MemoryAccess *Acc) {
+      for (Use &U : Acc->uses())
+        WorkList.insert(cast<MemoryAccess>(U.getUser()));
+    };
+    PushMemUses(DomDef);
+
+    // Check if DomDef may be read.
+    for (unsigned I = 0; I < WorkList.size(); I++) {
+      MemoryAccess *UseAccess = WorkList[I];
+
+      LLVM_DEBUG(dbgs() << "   Checking use " << *UseAccess);
+      if (--ScanLimit == 0) {
+        LLVM_DEBUG(dbgs() << "  ...  hit scan limit\n");
+        return None;
+      }
+
+      // Bail out on MemoryPhis for now.
+      if (isa<MemoryPhi>(UseAccess)) {
+        LLVM_DEBUG(dbgs() << "  ...  hit MemoryPhi\n");
+        return None;
+      }
+
+      Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
+      LLVM_DEBUG(dbgs() << " (" << *UseInst << ")\n");
+
+      if (isNoopIntrinsic(cast<MemoryUseOrDef>(UseAccess))) {
+        PushMemUses(UseAccess);
+        continue;
+      }
+
+      // Uses which may read the original MemoryDef mean we cannot eliminate the
+      // original MD. Stop walk.
+      if (isReadClobber(DefLoc, UseInst)) {
+        LLVM_DEBUG(dbgs() << "  ... found read clobber\n");
+        return None;
+      }
+
+      if (StartDef == UseAccess)
+        continue;
+
+      // Check all uses for MemoryDefs, except for defs completely overwriting
+      // the original location. Otherwise we have to check uses of *all*
+      // MemoryDefs we discover, including non-aliasing ones. Otherwise we might
+      // miss cases like the following
+      //   1 = Def(LoE) ; <----- DomDef stores [0,1]
+      //   2 = Def(1)   ; (2, 1) = NoAlias,   stores [2,3]
+      //   Use(2)       ; MayAlias 2 *and* 1, loads [0, 3].
+      //                  (The Use points to the *first* Def it may alias)
+      //   3 = Def(1)   ; <---- Current  (3, 2) = NoAlias, (3,1) = MayAlias,
+      //                  stores [0,1]
+      if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) {
+        if (!isCompleteOverwrite(DefLoc, UseInst))
+          PushMemUses(UseDef);
+      }
+    }
+
+    // No aliasing MemoryUses of DomDef found, DomDef is potentially dead.
+    return {DomDef};
+  }
+
+  // Delete dead memory defs
+  void deleteDeadInstruction(Instruction *SI) {
+    MemorySSAUpdater Updater(&MSSA);
+    SmallVector<Instruction *, 32> NowDeadInsts;
+    NowDeadInsts.push_back(SI);
+    --NumFastOther;
+
+    while (!NowDeadInsts.empty()) {
+      Instruction *DeadInst = NowDeadInsts.pop_back_val();
+      ++NumFastOther;
+
+      // Try to preserve debug information attached to the dead instruction.
+      salvageDebugInfo(*DeadInst);
+
+      // Remove the Instruction from MSSA.
+      if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) {
+        if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
+          SkipStores.insert(MD);
+        }
+        Updater.removeMemoryAccess(MA);
+      }
+
+      // Remove its operands
+      for (Use &O : DeadInst->operands())
+        if (Instruction *OpI = dyn_cast<Instruction>(O)) {
+          O = nullptr;
+          if (isInstructionTriviallyDead(OpI, &TLI))
+            NowDeadInsts.push_back(OpI);
+        }
+
+      DeadInst->eraseFromParent();
+    }
+  }
+
+  // Check for any extra throws between SI and NI that block DSE.  This only
+  // checks extra maythrows (those that aren't MemoryDef's). MemoryDef that may
+  // throw are handled during the walk from one def to the next.
+  bool mayThrowBetween(Instruction *SI, Instruction *NI,
+                       const Value *SILocUnd) const {
+    // First see if we can ignore it by using the fact that SI is an
+    // alloca/alloca like object that is not visible to the caller during
+    // execution of the function.
+    if (SILocUnd && InvisibleToCaller.count(SILocUnd))
+      return false;
+
+    if (SI->getParent() == NI->getParent())
+      return ThrowingBlocks.find(SI->getParent()) != ThrowingBlocks.end();
+    return !ThrowingBlocks.empty();
+  }
+
+  // Check if \p NI acts as a DSE barrier for \p SI. The following instructions
+  // act as barriers:
+  //  * A memory instruction that may throw and \p SI accesses a non-stack
+  //  object.
+  //  * Atomic stores stronger that monotonic.
+  bool isDSEBarrier(Instruction *SI, MemoryLocation &SILoc,
+                    const Value *SILocUnd, Instruction *NI,
+                    MemoryLocation &NILoc) const {
+    // If NI may throw it acts as a barrier, unless we are to an alloca/alloca
+    // like object that does not escape.
+    if (NI->mayThrow() && !InvisibleToCaller.count(SILocUnd))
+      return true;
+
+    if (NI->isAtomic()) {
+      if (auto *NSI = dyn_cast<StoreInst>(NI)) {
+        if (isStrongerThanMonotonic(NSI->getOrdering()))
+          return true;
+      } else
+        llvm_unreachable(
+            "Other instructions should be modeled/skipped in MemorySSA");
+    }
+
+    return false;
+  }
+};
+
+bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
+                                  MemorySSA &MSSA, DominatorTree &DT,
+                                  PostDominatorTree &PDT,
+                                  const TargetLibraryInfo &TLI) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  bool MadeChange = false;
+
+  DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI);
+  // For each store:
+  for (unsigned I = 0; I < State.MemDefs.size(); I++) {
+    MemoryDef *Current = State.MemDefs[I];
+    if (State.SkipStores.count(Current))
+      continue;
+    Instruction *SI = cast<MemoryDef>(Current)->getMemoryInst();
+    auto MaybeSILoc = State.getLocForWriteEx(SI);
+    if (!MaybeSILoc) {
+      LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for "
+                        << *SI << "\n");
+      continue;
+    }
+    MemoryLocation SILoc = *MaybeSILoc;
+    assert(SILoc.Ptr && "SILoc should not be null");
+    const Value *SILocUnd = GetUnderlyingObject(SILoc.Ptr, DL);
+    Instruction *DefObj =
+        const_cast<Instruction *>(dyn_cast<Instruction>(SILocUnd));
+    bool DefVisibleToCaller = !State.InvisibleToCaller.count(SILocUnd);
+    if (DefObj && ((isAllocLikeFn(DefObj, &TLI) &&
+                    !PointerMayBeCapturedBefore(DefObj, false, true, SI, &DT))))
+      DefVisibleToCaller = false;
+
+    LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " << *SI
+                      << "\n");
+
+    int ScanLimit = MemorySSAScanLimit;
+    MemoryDef *StartDef = Current;
+    // Walk MemorySSA upward to find MemoryDefs that might be killed by SI.
+    while (Optional<MemoryAccess *> Next = State.getDomMemoryDef(
+               StartDef, Current, SILoc, DefVisibleToCaller, ScanLimit)) {
+      MemoryAccess *DomAccess = *Next;
+      LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DomAccess << "\n");
+      MemoryDef *NextDef = dyn_cast<MemoryDef>(DomAccess);
+      Instruction *NI = NextDef->getMemoryInst();
+      LLVM_DEBUG(dbgs() << "  def " << *NI << "\n");
+
+      if (!hasAnalyzableMemoryWrite(NI, TLI))
+        break;
+      MemoryLocation NILoc = *State.getLocForWriteEx(NI);
+      // Check for anything that looks like it will be a barrier to further
+      // removal
+      if (State.isDSEBarrier(SI, SILoc, SILocUnd, NI, NILoc)) {
+        LLVM_DEBUG(dbgs() << "  stop, barrier\n");
+        break;
+      }
+
+      // Before we try to remove anything, check for any extra throwing
+      // instructions that block us from DSEing
+      if (State.mayThrowBetween(SI, NI, SILocUnd)) {
+        LLVM_DEBUG(dbgs() << " stop, may throw!\n");
+        break;
+      }
+
+      // Check if NI overwrites SI.
+      int64_t InstWriteOffset, DepWriteOffset;
+      InstOverlapIntervalsTy IOL;
+      OverwriteResult OR = isOverwrite(SILoc, NILoc, DL, TLI, DepWriteOffset,
+                                       InstWriteOffset, NI, IOL, AA, &F);
+
+      if (OR == OW_Complete) {
+        LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: " << *NI
+                          << "\n  KILLER: " << *SI << '\n');
+        State.deleteDeadInstruction(NI);
+        ++NumFastStores;
+        MadeChange = true;
+      } else
+        Current = NextDef;
+    }
+  }
+
+  return MadeChange;
+}
+} // end anonymous namespace
+
 //===----------------------------------------------------------------------===//
 // DSE Pass
 //===----------------------------------------------------------------------===//
 PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
-  AliasAnalysis *AA = &AM.getResult<AAManager>(F);
-  DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
-  MemoryDependenceResults *MD = &AM.getResult<MemoryDependenceAnalysis>(F);
-  const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+  AliasAnalysis &AA = AM.getResult<AAManager>(F);
+  const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+
+  if (EnableMemorySSA) {
+    MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+    PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+
+    if (!eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI))
+      return PreservedAnalyses::all();
+  } else {
+    MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
 
-  if (!eliminateDeadStores(F, AA, MD, DT, TLI))
-    return PreservedAnalyses::all();
+    if (!eliminateDeadStores(F, &AA, &MD, &DT, &TLI))
+      return PreservedAnalyses::all();
+  }
 
   PreservedAnalyses PA;
   PA.preserveSet<CFGAnalyses>();
   PA.preserve<GlobalsAA>();
-  PA.preserve<MemoryDependenceAnalysis>();
+  if (EnableMemorySSA)
+    PA.preserve<MemorySSAAnalysis>();
+  else
+    PA.preserve<MemoryDependenceAnalysis>();
   return PA;
 }
 
@@ -1388,25 +1869,42 @@ public:
     if (skipFunction(F))
       return false;
 
-    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
-    MemoryDependenceResults *MD =
-        &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
-    const TargetLibraryInfo *TLI =
-        &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+    AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    const TargetLibraryInfo &TLI =
+        getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+
+    if (EnableMemorySSA) {
+      MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+      PostDominatorTree &PDT =
+          getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+
+      return eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI);
+    } else {
+      MemoryDependenceResults &MD =
+          getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
 
-    return eliminateDeadStores(F, AA, MD, DT, TLI);
+      return eliminateDeadStores(F, &AA, &MD, &DT, &TLI);
+    }
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<AAResultsWrapperPass>();
-    AU.addRequired<MemoryDependenceWrapperPass>();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
-    AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
-    AU.addPreserved<MemoryDependenceWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+
+    if (EnableMemorySSA) {
+      AU.addRequired<PostDominatorTreeWrapperPass>();
+      AU.addRequired<MemorySSAWrapperPass>();
+      AU.addPreserved<PostDominatorTreeWrapperPass>();
+      AU.addPreserved<MemorySSAWrapperPass>();
+    } else {
+      AU.addRequired<MemoryDependenceWrapperPass>();
+      AU.addPreserved<MemoryDependenceWrapperPass>();
+    }
   }
 };
 
@@ -1417,8 +1915,10 @@ char DSELegacyPass::ID = 0;
 INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
                       false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/2011-09-06-EndOfFunction.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/2011-09-06-EndOfFunction.ll
index b9a0ea7..0d84e9c 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/2011-09-06-EndOfFunction.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/2011-09-06-EndOfFunction.ll
@@ -1,3 +1,4 @@
+; XFAIL: *
 ; RUN: opt -dse -enable-dse-memoryssa -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll
index a7278d5..1028c8e 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreBegin.ll
@@ -1,4 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; XFAIL: *
 ; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s
 
 define void @write4to7(i32* nocapture %p) {
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll
index d8791aa..0711855 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/OverwriteStoreEnd.ll
@@ -1,4 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; XFAIL: *
 ; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll
index b21183c..9558033 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/atomic.ll
@@ -1,3 +1,4 @@
+; XFAIL: *
 ; RUN: opt -basicaa -dse -enable-dse-memoryssa -S < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/calloc-store.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/calloc-store.ll
index 19ebaa9..397c594 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/calloc-store.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/calloc-store.ll
@@ -1,3 +1,5 @@
+; XFAIL: *
+
 ; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s
 
 declare noalias i8* @calloc(i64, i64)
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/fence-todo.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/fence-todo.ll
new file mode 100644
index 0000000..4ee2958
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/fence-todo.ll
@@ -0,0 +1,50 @@
+; XFAIL: *
+
+; RUN: opt -S -basicaa -dse -enable-dse-memoryssa < %s | FileCheck %s
+
+; We DSE stack alloc'ed and byval locations, in the presence of fences.
+; Fence does not make an otherwise thread local store visible.
+; Right now the DSE in presence of fence is only done in end blocks (with no successors),
+; but the same logic applies to other basic blocks as well.
+; The store to %addr.i can be removed since it is a byval attribute
+define void @test3(i32* byval %addr.i) {
+; CHECK-LABEL: @test3
+; CHECK-NOT: store
+; CHECK: fence
+; CHECK: ret
+  store i32 5, i32* %addr.i, align 4
+  fence release
+  ret void
+}
+
+declare void @foo(i8* nocapture %p)
+
+declare noalias i8* @malloc(i32)
+
+; DSE of stores in locations allocated through library calls.
+define void @test_nocapture() {
+; CHECK-LABEL: @test_nocapture
+; CHECK: malloc
+; CHECK: foo
+; CHECK-NOT: store
+; CHECK: fence
+  %m  =  call i8* @malloc(i32 24)
+  call void @foo(i8* %m)
+  store i8 4, i8* %m
+  fence release
+  ret void
+}
+
+
+; This is a full fence, but it does not make a thread local store visible.
+; We can DSE the store in presence of the fence.
+define void @fence_seq_cst() {
+; CHECK-LABEL: @fence_seq_cst
+; CHECK-NEXT: fence seq_cst
+; CHECK-NEXT: ret void
+  %P1 = alloca i32
+  store i32 0, i32* %P1, align 4
+  fence seq_cst
+  store i32 4, i32* %P1, align 4
+  ret void
+}
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll
index 9b43d37..b22fc9c 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/fence.ll
@@ -46,51 +46,3 @@ define void @test2(i32* %addr.i) {
   store i32 5, i32* %addr.i, align 4
   ret void
 }
-
-; We DSE stack alloc'ed and byval locations, in the presence of fences.
-; Fence does not make an otherwise thread local store visible.
-; Right now the DSE in presence of fence is only done in end blocks (with no successors),
-; but the same logic applies to other basic blocks as well.
-; The store to %addr.i can be removed since it is a byval attribute
-define void @test3(i32* byval %addr.i) {
-; CHECK-LABEL: @test3
-; CHECK-NOT: store
-; CHECK: fence
-; CHECK: ret
-  store i32 5, i32* %addr.i, align 4
-  fence release
-  ret void
-}
-
-declare void @foo(i8* nocapture %p)
-
-declare noalias i8* @malloc(i32)
-
-; DSE of stores in locations allocated through library calls.
-define void @test_nocapture() {
-; CHECK-LABEL: @test_nocapture
-; CHECK: malloc
-; CHECK: foo
-; CHECK-NOT: store
-; CHECK: fence
-  %m  =  call i8* @malloc(i32 24)
-  call void @foo(i8* %m)
-  store i8 4, i8* %m
-  fence release
-  ret void
-}
-
-
-; This is a full fence, but it does not make a thread local store visible.
-; We can DSE the store in presence of the fence.
-define void @fence_seq_cst() {
-; CHECK-LABEL: @fence_seq_cst
-; CHECK-NEXT: fence seq_cst
-; CHECK-NEXT: ret void
-  %P1 = alloca i32
-  store i32 0, i32* %P1, align 4
-  fence seq_cst
-  store i32 4, i32* %P1, align 4
-  ret void
-}
-
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/free.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/free.ll
index a9a3234..a6e47d7 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/free.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/free.ll
@@ -1,3 +1,5 @@
+; XFAIL: *
+
 ; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s
 
 target datalayout = "e-p:64:64:64"
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/inst-limits.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/inst-limits.ll
index 78fe292..638571f 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/inst-limits.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/inst-limits.ll
@@ -1,10 +1,9 @@
 ; RUN: opt -S -dse -enable-dse-memoryssa < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
-; If there are two stores to the same location, DSE should be able to remove
-; the first store if the two stores are separated by no more than 98
-; instructions. The existence of debug intrinsics between the stores should
-; not affect this instruction limit.
+; This test is not relevant for DSE with MemorySSA. Non-memory instructions
+; are ignored anyways. The limits for the MemorySSA traversal are tested in
+; llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll
 
 @x = global i32 0, align 4
 
@@ -129,7 +128,7 @@ entry:
 define i32 @test_outside_limit() {
 entry:
   ; The first store; later there is a second store to the same location
-  ; CHECK: store i32 1, i32* @x, align 4
+  ; CHECK-NOT: store i32 1, i32* @x, align 4
   store i32 1, i32* @x, align 4
 
   ; Insert 99 dummy instructions between the two stores; this is
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/lifetime.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/lifetime.ll
index 3b3f4d4..e4d4722 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/lifetime.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/lifetime.ll
@@ -1,3 +1,5 @@
+; XFAIL: *
+
 ; RUN: opt -S -basicaa -dse -enable-dse-memoryssa < %s | FileCheck %s
 
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/mda-with-dbg-values.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/mda-with-dbg-values.ll
index 52e2ecd..06bcc34 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/mda-with-dbg-values.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/mda-with-dbg-values.ll
@@ -1,13 +1,12 @@
-; RUN: opt -S -dse -enable-dse-memoryssa -memdep-block-scan-limit=3 < %s | FileCheck %s
-; RUN: opt -S -strip-debug -dse -enable-dse-memoryssa -memdep-block-scan-limit=3 < %s | FileCheck %s
+; RUN: opt -S -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=2 < %s | FileCheck %s
+; RUN: opt -S -strip-debug -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=2 < %s | FileCheck %s
 
-; Test case to check that the memory dependency analysis gets the same
-; result even if we have a dbg value between the memcpy and
-; store. The memory dependency is then used by DSE to remove the store.
+; Test case to check that DSE gets the same result even if we have a dbg value
+; between the memcpy.
 
-; We use -memdep-block-scan-limit=3 to be able to create a small test case.
-; Without it, we would need to squeeze in 100 instructions since the default
-; limit is 100.
+; This test case is less relevant for the MemorySSA backed version of DSE, as
+; debug values are not modeled in MemorySSA and are skipped regardless of the
+; exploration limit.
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -20,6 +19,11 @@ entry:
   %i = alloca i8, align 1
   store i8 1, i8* %i, align 1, !dbg !19
   call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !17, metadata !DIExpression()), !dbg !18
+  call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !17, metadata !DIExpression()), !dbg !18
+  call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !17, metadata !DIExpression()), !dbg !18
+  call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !17, metadata !DIExpression()), !dbg !18
+  call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !17, metadata !DIExpression()), !dbg !18
+  call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !17, metadata !DIExpression()), !dbg !18
   %0 = bitcast [1 x i8]* @g to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %i, i8* %0, i64 1, i1 false), !dbg !20
   br label %bb2
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memcpy-complete-overwrite.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memcpy-complete-overwrite.ll
index 38cb8a3..9315972 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/memcpy-complete-overwrite.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memcpy-complete-overwrite.ll
@@ -1,4 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+
+; XFAIL: *
 ; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s
 ; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -enable-dse-memoryssa -S | FileCheck %s
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memintrinsics.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memintrinsics.ll
index 51c45c3..c22ad19 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/memintrinsics.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memintrinsics.ll
@@ -1,4 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; XFAIL: *
 ; RUN: opt -S -dse -enable-dse-memoryssa < %s | FileCheck %s
 
 declare void @llvm.memcpy.p0i8.p0i8.i8(i8* nocapture, i8* nocapture, i8, i1) nounwind
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll
new file mode 100644
index 0000000..43a9f0c
--- /dev/null
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck --check-prefix=NO-LIMIT %s
+; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s
+; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=3 -S | FileCheck --check-prefix=LIMIT-3 %s
+; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=4 -S | FileCheck --check-prefix=LIMIT-4 %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+
+
+define void @test2(i32* noalias %P, i32* noalias %Q, i32* noalias %R) {
+;
+; NO-LIMIT-LABEL: @test2(
+; NO-LIMIT-NEXT:    br i1 true, label [[BB1:%.*]], label [[BB2:%.*]]
+; NO-LIMIT:       bb1:
+; NO-LIMIT-NEXT:    br label [[BB3:%.*]]
+; NO-LIMIT:       bb2:
+; NO-LIMIT-NEXT:    br label [[BB3]]
+; NO-LIMIT:       bb3:
+; NO-LIMIT-NEXT:    store i32 0, i32* [[Q:%.*]]
+; NO-LIMIT-NEXT:    store i32 0, i32* [[R:%.*]]
+; NO-LIMIT-NEXT:    store i32 0, i32* [[P:%.*]]
+; NO-LIMIT-NEXT:    ret void
+;
+; LIMIT-0-LABEL: @test2(
+; LIMIT-0-NEXT:    br i1 true, label [[BB1:%.*]], label [[BB2:%.*]]
+; LIMIT-0:       bb1:
+; LIMIT-0-NEXT:    br label [[BB3:%.*]]
+; LIMIT-0:       bb2:
+; LIMIT-0-NEXT:    br label [[BB3]]
+; LIMIT-0:       bb3:
+; LIMIT-0-NEXT:    store i32 0, i32* [[Q:%.*]]
+; LIMIT-0-NEXT:    store i32 0, i32* [[R:%.*]]
+; LIMIT-0-NEXT:    store i32 0, i32* [[P:%.*]]
+; LIMIT-0-NEXT:    ret void
+;
+; LIMIT-3-LABEL: @test2(
+; LIMIT-3-NEXT:    store i32 1, i32* [[P:%.*]]
+; LIMIT-3-NEXT:    br i1 true, label [[BB1:%.*]], label [[BB2:%.*]]
+; LIMIT-3:       bb1:
+; LIMIT-3-NEXT:    br label [[BB3:%.*]]
+; LIMIT-3:       bb2:
+; LIMIT-3-NEXT:    br label [[BB3]]
+; LIMIT-3:       bb3:
+; LIMIT-3-NEXT:    store i32 0, i32* [[Q:%.*]]
+; LIMIT-3-NEXT:    store i32 0, i32* [[R:%.*]]
+; LIMIT-3-NEXT:    store i32 0, i32* [[P]]
+; LIMIT-3-NEXT:    ret void
+;
+; LIMIT-4-LABEL: @test2(
+; LIMIT-4-NEXT:    br i1 true, label [[BB1:%.*]], label [[BB2:%.*]]
+; LIMIT-4:       bb1:
+; LIMIT-4-NEXT:    br label [[BB3:%.*]]
+; LIMIT-4:       bb2:
+; LIMIT-4-NEXT:    br label [[BB3]]
+; LIMIT-4:       bb3:
+; LIMIT-4-NEXT:    store i32 0, i32* [[Q:%.*]]
+; LIMIT-4-NEXT:    store i32 0, i32* [[R:%.*]]
+; LIMIT-4-NEXT:    store i32 0, i32* [[P:%.*]]
+; LIMIT-4-NEXT:    ret void
+;
+  store i32 1, i32* %P
+  br i1 true, label %bb1, label %bb2
+bb1:
+  br label %bb3
+bb2:
+  br label %bb3
+bb3:
+  store i32 0, i32* %Q
+  store i32 0, i32* %R
+  store i32 0, i32* %P
+  ret void
+}
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll
index be4574e..c142ea6 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-and-memcpy.ll
@@ -59,8 +59,7 @@ define void @test17_atomic_weaker_2(i8* %P, i8* noalias %Q) nounwind ssp {
 ; Should not delete the volatile memset.
 define void @test17v(i8* %P, i8* %Q) nounwind ssp {
 ; CHECK-LABEL: @test17v(
-; CHECK-NEXT:    tail call void @llvm.memset.p0i8.i64(i8* [[P:%.*]], i8 42, i64 8, i1 true)
-; CHECK-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[Q:%.*]], i64 12, i1 false)
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   tail call void @llvm.memset.p0i8.i64(i8* %P, i8 42, i64 8, i1 true)
@@ -74,8 +73,7 @@ define void @test17v(i8* %P, i8* %Q) nounwind ssp {
 ; Previously this was not allowed (PR8728), also discussed in PR11763.
 define void @test18(i8* %P, i8* %Q, i8* %R) nounwind ssp {
 ; CHECK-LABEL: @test18(
-; CHECK-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false)
-; CHECK-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[R:%.*]], i64 12, i1 false)
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[R:%.*]], i64 12, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false)
@@ -85,8 +83,7 @@ define void @test18(i8* %P, i8* %Q, i8* %R) nounwind ssp {
 
 define void @test18_atomic(i8* %P, i8* %Q, i8* %R) nounwind ssp {
 ; CHECK-LABEL: @test18_atomic(
-; CHECK-NEXT:    tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1)
-; CHECK-NEXT:    tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P]], i8* align 1 [[R:%.*]], i64 12, i32 1)
+; CHECK-NEXT:    tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[R:%.*]], i64 12, i32 1)
 ; CHECK-NEXT:    ret void
 ;
   tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %P, i8* align 1 %Q, i64 12, i32 1)
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-missing-debugloc.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-missing-debugloc.ll
index 11d155c..d1aaab9 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-missing-debugloc.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-missing-debugloc.ll
@@ -1,6 +1,7 @@
 ; Test that the getelementptr generated when the dse pass determines that
 ; a memset can be shortened has the debugloc carried over from the memset.
 
+; XFAIL: *
 ; RUN: opt -S -march=native -dse -enable-dse-memoryssa < %s| FileCheck %s
 ; CHECK: bitcast [5 x i64]* %{{[a-zA-Z_][a-zA-Z0-9_]*}} to i8*, !dbg
 ; CHECK-NEXT: %{{[0-9]+}} = getelementptr inbounds i8, i8* %0, i64 32, !dbg ![[DBG:[0-9]+]]
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/merge-stores-big-endian.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/merge-stores-big-endian.ll
index 8acc29f..055f117 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/merge-stores-big-endian.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/merge-stores-big-endian.ll
@@ -1,4 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; XFAIL: *
 ; RUN: opt -dse -enable-dse-memoryssa -enable-dse-partial-store-merging -S < %s | FileCheck %s
 target datalayout = "E-m:e-i64:64-i128:128-n32:64-S128"
 
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/merge-stores.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/merge-stores.ll
index feb1f31..b09904e 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/merge-stores.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/merge-stores.ll
@@ -1,4 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; XFAIL: *
 ; RUN: opt -dse -enable-dse-memoryssa -enable-dse-partial-store-merging -S < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64"
 
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll
index 7caf481..c1f7392 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-captures.ll
@@ -25,7 +25,6 @@ define i8* @test_return_captures_1() {
 define i8* @test_return_captures_2() {
 ; CHECK-LABEL: @test_return_captures_2(
 ; CHECK-NEXT:    [[M:%.*]] = call i8* @malloc(i64 24)
-; CHECK-NEXT:    store i8 0, i8* [[M]]
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    store i8 1, i8* [[M]]
@@ -91,7 +90,6 @@ exit:
 define i8* @test_malloc_capture_3() {
 ; CHECK-LABEL: @test_malloc_capture_3(
 ; CHECK-NEXT:    [[M:%.*]] = call i8* @malloc(i64 24)
-; CHECK-NEXT:    store i8 0, i8* [[M]]
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    store i8 1, i8* [[M]]
@@ -191,7 +189,6 @@ exit:
 define i8* @test_malloc_capture_7() {
 ; CHECK-LABEL: @test_malloc_capture_7(
 ; CHECK-NEXT:    [[M:%.*]] = call i8* @malloc(i64 24)
-; CHECK-NEXT:    store i8 0, i8* [[M]]
 ; CHECK-NEXT:    call void @may_throw()
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       exit:
@@ -215,10 +212,10 @@ exit:
 define void @test_alloca_nocapture_1() {
 ; CHECK-LABEL: @test_alloca_nocapture_1(
 ; CHECK-NEXT:    [[M:%.*]] = alloca i8
-; CHECK-NEXT:    store i8 0, i8* [[M]]
 ; CHECK-NEXT:    call void @foo()
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       exit:
+; CHECK-NEXT:    store i8 1, i8* [[M]]
 ; CHECK-NEXT:    ret void
 ;
   %m = alloca i8
@@ -240,6 +237,7 @@ define void @test_alloca_capture_1() {
 ; CHECK-NEXT:    call void @capture(i8* [[M]])
 ; CHECK-NEXT:    br label [[EXIT:%.*]]
 ; CHECK:       exit:
+; CHECK-NEXT:    store i8 1, i8* [[M]]
 ; CHECK-NEXT:    ret void
 ;
   %m = alloca i8
@@ -262,6 +260,7 @@ define void @test_alloca_capture_2(%S1* %E) {
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[F_PTR:%.*]] = getelementptr [[S1:%.*]], %S1* [[E:%.*]], i32 0, i32 0
 ; CHECK-NEXT:    store i8* [[M]], i8** [[F_PTR]]
+; CHECK-NEXT:    store i8 1, i8* [[M]]
 ; CHECK-NEXT:    ret void
 ;
   %m = alloca i8
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll
index 41d4d019..a719e27 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll
@@ -30,6 +30,7 @@ define void @test12(i32* %p) personality i32 (...)* @__CxxFrameHandler3 {
 ; CHECK-NEXT:    [[C1:%.*]] = cleanuppad within none []
 ; CHECK-NEXT:    br label [[EXIT]]
 ; CHECK:       exit:
+; CHECK-NEXT:    store i32 40, i32* [[SV]]
 ; CHECK-NEXT:    ret void
 ;
 block1:
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll
index bcf75e3..1ec4434 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll
@@ -111,7 +111,6 @@ define void @test_loop(i32 %N, i32* noalias nocapture readonly %A, i32* noalias
 ; CHECK:       for.body4.lr.ph:
 ; CHECK-NEXT:    [[I_028:%.*]] = phi i32 [ [[INC11:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[FOR_BODY4_LR_PH_PREHEADER]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_028]]
-; CHECK-NEXT:    store i32 0, i32* [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[I_028]], [[N]]
 ; CHECK-NEXT:    br label [[FOR_BODY4:%.*]]
 ; CHECK:       for.body4:
@@ -170,3 +169,116 @@ for.cond.cleanup3:                                ; preds = %for.body4
   %exitcond29 = icmp eq i32 %inc11, %N
   br i1 %exitcond29, label %for.cond.cleanup, label %for.body4.lr.ph
 }
+
+declare i1 @cond() readnone
+
+; TODO: We can eliminate the store in for.header, but we currently hit a MemoryPhi.
+define void @loop_multiple_def_uses(i32* noalias %P) {
+; CHECK-LABEL: @loop_multiple_def_uses(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_HEADER:%.*]]
+; CHECK:       for.header:
+; CHECK-NEXT:    store i32 1, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[C1:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[C1]], label [[FOR_BODY:%.*]], label [[END:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    store i32 1, i32* [[P]], align 4
+; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[P]]
+; CHECK-NEXT:    br label [[FOR_HEADER]]
+; CHECK:       end:
+; CHECK-NEXT:    store i32 3, i32* [[P]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.header
+
+for.header:
+  store i32 1, i32* %P, align 4
+  %c1 = call i1 @cond()
+  br i1 %c1, label %for.body, label %end
+
+for.body:
+  store i32 1, i32* %P, align 4
+  %lv = load i32, i32* %P
+  br label %for.header
+
+end:
+  store i32 3, i32* %P, align 4
+  ret void
+}
+
+; We cannot eliminate the store in for.header, as it is only partially
+; overwritten in for.body and read afterwards.
+define void @loop_multiple_def_uses_partial_write(i32* noalias %p) {
+; CHECK-LABEL: @loop_multiple_def_uses_partial_write(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_HEADER:%.*]]
+; CHECK:       for.header:
+; CHECK-NEXT:    store i32 1239491, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[C1:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[C1]], label [[FOR_BODY:%.*]], label [[END:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[C:%.*]] = bitcast i32* [[P]] to i8*
+; CHECK-NEXT:    store i8 1, i8* [[C]], align 4
+; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[P]]
+; CHECK-NEXT:    br label [[FOR_HEADER]]
+; CHECK:       end:
+; CHECK-NEXT:    store i32 3, i32* [[P]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.header
+
+for.header:
+  store i32 1239491, i32* %p, align 4
+  %c1 = call i1 @cond()
+  br i1 %c1, label %for.body, label %end
+
+for.body:
+  %c = bitcast i32* %p to i8*
+  store i8 1, i8* %c, align 4
+  %lv = load i32, i32* %p
+  br label %for.header
+
+end:
+  store i32 3, i32* %p, align 4
+  ret void
+}
+
+; We cannot eliminate the store in for.header, as the location is not overwritten
+; in for.body and read afterwards.
+define void @loop_multiple_def_uses_mayalias_write(i32* %p, i32* %q) {
+
+; CHECK-LABEL: @loop_multiple_def_uses_mayalias_write(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_HEADER:%.*]]
+; CHECK:       for.header:
+; CHECK-NEXT:    store i32 1239491, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    [[C1:%.*]] = call i1 @cond()
+; CHECK-NEXT:    br i1 [[C1]], label [[FOR_BODY:%.*]], label [[END:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    store i32 1, i32* [[Q:%.*]], align 4
+; CHECK-NEXT:    [[LV:%.*]] = load i32, i32* [[P]]
+; CHECK-NEXT:    br label [[FOR_HEADER]]
+; CHECK:       end:
+; CHECK-NEXT:    store i32 3, i32* [[P]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.header
+
+for.header:
+  store i32 1239491, i32* %p, align 4
+  %c1 = call i1 @cond()
+  br i1 %c1, label %for.body, label %end
+
+for.body:
+  store i32 1, i32* %q, align 4
+  %lv = load i32, i32* %p
+  br label %for.header
+
+end:
+  store i32 3, i32* %p, align 4
+  ret void
+}
+
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll
index ddfc4d4..ce34986 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll
@@ -103,3 +103,73 @@ bb3:
   store i8 1, i8* %P2
   ret void
 }
+
+declare void @hoge()
+
+; Check a function with a MemoryPhi with 3 incoming values.
+define void @widget(i32* %Ptr, i1 %c1, i1 %c2, i32 %v1, i32 %v2, i32 %v3) {
+; CHECK-LABEL: @widget(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    tail call void @hoge()
+; CHECK-NEXT:    br i1 [[C1:%.*]], label [[BB3:%.*]], label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br i1 [[C2:%.*]], label [[BB2:%.*]], label [[BB3]]
+; CHECK:       bb2:
+; CHECK-NEXT:    store i32 -1, i32* [[PTR:%.*]], align 4
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    br label [[BB4:%.*]]
+; CHECK:       bb4:
+; CHECK-NEXT:    switch i32 [[V1:%.*]], label [[BB8:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BB5:%.*]]
+; CHECK-NEXT:    i32 1, label [[BB6:%.*]]
+; CHECK-NEXT:    i32 2, label [[BB7:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb5:
+; CHECK-NEXT:    store i32 0, i32* [[PTR]], align 4
+; CHECK-NEXT:    br label [[BB8]]
+; CHECK:       bb6:
+; CHECK-NEXT:    store i32 1, i32* [[PTR]], align 4
+; CHECK-NEXT:    br label [[BB8]]
+; CHECK:       bb7:
+; CHECK-NEXT:    store i32 2, i32* [[PTR]], align 4
+; CHECK-NEXT:    br label [[BB8]]
+; CHECK:       bb8:
+; CHECK-NEXT:    br label [[BB4]]
+;
+bb:
+  tail call void @hoge()
+  br i1 %c1, label %bb3, label %bb1
+
+bb1:                                              ; preds = %bb
+  br i1 %c2, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb1
+  store i32 -1, i32* %Ptr, align 4
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb1, %bb
+  br label %bb4
+
+bb4:                                              ; preds = %bb8, %bb3
+  switch i32 %v1, label %bb8 [
+  i32 0, label %bb5
+  i32 1, label %bb6
+  i32 2, label %bb7
+  ]
+
+bb5:                                              ; preds = %bb4
+  store i32 0, i32* %Ptr, align 4
+  br label %bb8
+
+bb6:                                              ; preds = %bb4
+  store i32 1, i32* %Ptr, align 4
+  br label %bb8
+
+bb7:                                              ; preds = %bb4
+  store i32 2, i32* %Ptr, align 4
+  br label %bb8
+
+bb8:                                              ; preds = %bb7, %bb6, %bb5, %bb4
+  br label %bb4
+}
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-partial.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-partial.ll
index 17bd8a6..cbd7dae 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-partial.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-partial.ll
@@ -32,14 +32,13 @@ bb3:
 
 define void @second_store_bigger(i32* noalias %P) {
 ; CHECK-LABEL: @second_store_bigger(
-; CHECK-NEXT:    store i32 1, i32* [[P:%.*]]
 ; CHECK-NEXT:    br i1 true, label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    [[P_I64:%.*]] = bitcast i32* [[P]] to i64*
+; CHECK-NEXT:    [[P_I64:%.*]] = bitcast i32* [[P:%.*]] to i64*
 ; CHECK-NEXT:    store i64 0, i64* [[P_I64]]
 ; CHECK-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll
index 2b02277..520a6ea 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll
@@ -6,14 +6,13 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
 define void @test2(i32* noalias %P) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    store i32 1, i32* [[P:%.*]]
 ; CHECK-NEXT:    br i1 true, label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    br label [[BB3:%.*]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    store i32 0, i32* [[P]]
+; CHECK-NEXT:    store i32 0, i32* [[P:%.*]]
 ; CHECK-NEXT:    ret void
 ;
   store i32 1, i32* %P
@@ -53,7 +52,6 @@ bb3:
 
 define void @test7(i32* noalias %P, i32* noalias %Q) {
 ; CHECK-LABEL: @test7(
-; CHECK-NEXT:    store i32 1, i32* [[Q:%.*]]
 ; CHECK-NEXT:    br i1 true, label [[BB1:%.*]], label [[BB2:%.*]]
 ; CHECK:       bb1:
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[P:%.*]]
@@ -61,7 +59,7 @@ define void @test7(i32* noalias %P, i32* noalias %Q) {
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB3]]
 ; CHECK:       bb3:
-; CHECK-NEXT:    store i32 0, i32* [[Q]]
+; CHECK-NEXT:    store i32 0, i32* [[Q:%.*]]
 ; CHECK-NEXT:    store i32 0, i32* [[P]]
 ; CHECK-NEXT:    ret void
 ;
@@ -114,3 +112,38 @@ bb3:
   store i32 0, i32* %P
   ret void
 }
+
+; We cannot eliminate `store i32 0, i32* %P`, as it is read by the later load.
+; Make sure that we check the uses of `store i32 1, i32* %P.1 which does not
+; alias %P. Note that uses point to the *first* def that may alias.
+define void @overlapping_read(i32* %P) {
+; CHECK-LABEL: @overlapping_read(
+; CHECK-NEXT:    store i32 0, i32* [[P:%.*]]
+; CHECK-NEXT:    [[P_1:%.*]] = getelementptr i32, i32* [[P]], i32 1
+; CHECK-NEXT:    store i32 1, i32* [[P_1]]
+; CHECK-NEXT:    [[P_64:%.*]] = bitcast i32* [[P]] to i64*
+; CHECK-NEXT:    [[LV:%.*]] = load i64, i64* [[P_64]]
+; CHECK-NEXT:    br i1 true, label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    ret void
+; CHECK:       bb3:
+; CHECK-NEXT:    store i32 2, i32* [[P]]
+; CHECK-NEXT:    ret void
+;
+  store i32 0, i32* %P
+  %P.1 = getelementptr i32, i32* %P, i32 1
+  store i32 1, i32* %P.1
+
+  %P.64 = bitcast i32* %P to i64*
+  %lv = load i64, i64* %P.64
+  br i1 true, label %bb1, label %bb2
+bb1:
+  br label %bb3
+bb2:
+  ret void
+bb3:
+  store i32 2, i32* %P
+  ret void
+}
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/operand-bundles.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/operand-bundles.ll
index 9e88651..3a59286 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/operand-bundles.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/operand-bundles.ll
@@ -1,3 +1,4 @@
+; XFAIL: *
 ; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s
 
 declare noalias i8* @malloc(i64) "malloc-like"
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll
index b96cc9f..d0a3cef 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll
@@ -1,4 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; XFAIL: *
 ; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s
 ; RUN: opt < %s -aa-pipeline=basic-aa -passes=dse -enable-dse-memoryssa -S | FileCheck %s
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
@@ -9,19 +10,6 @@ declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) n
 declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
 declare void @llvm.init.trampoline(i8*, i8*, i8*)
 
-; PR8576 - Should delete store of 10 even though p/q are may aliases.
-define void @test2(i32 *%p, i32 *%q) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:    store i32 20, i32* [[Q:%.*]], align 4
-; CHECK-NEXT:    store i32 30, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    ret void
-;
-  store i32 10, i32* %p, align 4
-  store i32 20, i32* %q, align 4
-  store i32 30, i32* %p, align 4
-  ret void
-}
-
 define void @test5(i32* %Q) {
 ; CHECK-LABEL: @test5(
 ; CHECK-NEXT:    [[A:%.*]] = load volatile i32, i32* [[Q:%.*]]
@@ -32,62 +20,6 @@ define void @test5(i32* %Q) {
   ret void
 }
 
-; Should delete store of 10 even though memset is a may-store to P (P and Q may
-; alias).
-define void @test6(i32 *%p, i8 *%q) {
-; CHECK-LABEL: @test6(
-; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[Q:%.*]], i8 42, i64 900, i1 false)
-; CHECK-NEXT:    store i32 30, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    ret void
-;
-  store i32 10, i32* %p, align 4       ;; dead.
-  call void @llvm.memset.p0i8.i64(i8* %q, i8 42, i64 900, i1 false)
-  store i32 30, i32* %p, align 4
-  ret void
-}
-
-; Should delete store of 10 even though memset is a may-store to P (P and Q may
-; alias).
-define void @test6_atomic(i32* align 4 %p, i8* align 4 %q) {
-; CHECK-LABEL: @test6_atomic(
-; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[Q:%.*]], i8 42, i64 900, i32 4)
-; CHECK-NEXT:    store atomic i32 30, i32* [[P:%.*]] unordered, align 4
-; CHECK-NEXT:    ret void
-;
-  store atomic i32 10, i32* %p unordered, align 4       ;; dead.
-  call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %q, i8 42, i64 900, i32 4)
-  store atomic i32 30, i32* %p unordered, align 4
-  ret void
-}
-
-; Should delete store of 10 even though memcpy is a may-store to P (P and Q may
-; alias).
-define void @test7(i32 *%p, i8 *%q, i8* noalias %r) {
-; CHECK-LABEL: @test7(
-; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[Q:%.*]], i8* [[R:%.*]], i64 900, i1 false)
-; CHECK-NEXT:    store i32 30, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    ret void
-;
-  store i32 10, i32* %p, align 4       ;; dead.
-  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %q, i8* %r, i64 900, i1 false)
-  store i32 30, i32* %p, align 4
-  ret void
-}
-
-; Should delete store of 10 even though memcpy is a may-store to P (P and Q may
-; alias).
-define void @test7_atomic(i32* align 4 %p, i8* align 4 %q, i8* noalias align 4 %r) {
-; CHECK-LABEL: @test7_atomic(
-; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 [[Q:%.*]], i8* align 4 [[R:%.*]], i64 900, i32 4)
-; CHECK-NEXT:    store atomic i32 30, i32* [[P:%.*]] unordered, align 4
-; CHECK-NEXT:    ret void
-;
-  store atomic i32 10, i32* %p unordered, align 4       ;; dead.
-  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %q, i8* align 4 %r, i64 900, i32 4)
-  store atomic i32 30, i32* %p unordered, align 4
-  ret void
-}
-
 ; Do not delete stores that are only partially killed.
 define i32 @test8() {
 ; CHECK-LABEL: @test8(
@@ -158,46 +90,6 @@ define void @test12({ i32, i32 }* %x) nounwind  {
 }
 
 
-; %P doesn't escape, the DEAD instructions should be removed.
-declare void @test13f()
-define i32* @test13() {
-; CHECK-LABEL: @test13(
-; CHECK-NEXT:    [[PTR:%.*]] = tail call i8* @malloc(i32 4)
-; CHECK-NEXT:    [[P:%.*]] = bitcast i8* [[PTR]] to i32*
-; CHECK-NEXT:    call void @test13f()
-; CHECK-NEXT:    store i32 0, i32* [[P]]
-; CHECK-NEXT:    ret i32* [[P]]
-;
-  %ptr = tail call i8* @malloc(i32 4)
-  %P = bitcast i8* %ptr to i32*
-  %DEAD = load i32, i32* %P
-  %DEAD2 = add i32 %DEAD, 1
-  store i32 %DEAD2, i32* %P
-  call void @test13f( )
-  store i32 0, i32* %P
-  ret i32* %P
-}
-
-define i32 addrspace(1)* @test13_addrspacecast() {
-; CHECK-LABEL: @test13_addrspacecast(
-; CHECK-NEXT:    [[P:%.*]] = tail call i8* @malloc(i32 4)
-; CHECK-NEXT:    [[P_BC:%.*]] = bitcast i8* [[P]] to i32*
-; CHECK-NEXT:    [[P:%.*]] = addrspacecast i32* [[P_BC]] to i32 addrspace(1)*
-; CHECK-NEXT:    call void @test13f()
-; CHECK-NEXT:    store i32 0, i32 addrspace(1)* [[P]]
-; CHECK-NEXT:    ret i32 addrspace(1)* [[P]]
-;
-  %p = tail call i8* @malloc(i32 4)
-  %p.bc = bitcast i8* %p to i32*
-  %P = addrspacecast i32* %p.bc to i32 addrspace(1)*
-  %DEAD = load i32, i32 addrspace(1)* %P
-  %DEAD2 = add i32 %DEAD, 1
-  store i32 %DEAD2, i32 addrspace(1)* %P
-  call void @test13f( )
-  store i32 0, i32 addrspace(1)* %P
-  ret i32 addrspace(1)* %P
-}
-
 declare noalias i8* @malloc(i32)
 declare noalias i8* @calloc(i32, i32)
 
@@ -241,26 +133,6 @@ define void @test22(i1 %i, i32 %k, i32 %m) nounwind {
   ret void
 }
 
-; Make sure same sized store to later element is deleted
-define void @test24([2 x i32]* %a, i32 %b, i32 %c) nounwind {
-; CHECK-LABEL: @test24(
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A:%.*]], i64 0, i64 0
-; CHECK-NEXT:    store i32 [[B:%.*]], i32* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 1
-; CHECK-NEXT:    store i32 [[C:%.*]], i32* [[TMP2]], align 4
-; CHECK-NEXT:    ret void
-;
-  %1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0
-  store i32 0, i32* %1, align 4
-  %2 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1
-  store i32 0, i32* %2, align 4
-  %3 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0
-  store i32 %b, i32* %3, align 4
-  %4 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1
-  store i32 %c, i32* %4, align 4
-  ret void
-}
-
 ; Remove redundant store if loaded value is in another block.
 define i32 @test26(i1 %c, i32* %p) {
 ; CHECK-LABEL: @test26(
@@ -311,35 +183,6 @@ bb3:
 
 declare void @unknown_func()
 
-; Don't remove redundant store because of unknown call.
-define i32 @test30(i1 %c, i32* %p, i32 %i) {
-; CHECK-LABEL: @test30(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[V:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
-; CHECK:       bb1:
-; CHECK-NEXT:    br label [[BB3:%.*]]
-; CHECK:       bb2:
-; CHECK-NEXT:    call void @unknown_func()
-; CHECK-NEXT:    br label [[BB3]]
-; CHECK:       bb3:
-; CHECK-NEXT:    store i32 [[V]], i32* [[P]], align 4
-; CHECK-NEXT:    ret i32 0
-;
-entry:
-  %v = load i32, i32* %p, align 4
-  br i1 %c, label %bb1, label %bb2
-bb1:
-  br label %bb3
-bb2:
-  ; Might overwrite value at %p
-  call void @unknown_func()
-  br label %bb3
-bb3:
-  store i32 %v, i32* %p, align 4
-  ret i32 0
-}
-
 ; Remove redundant store if loaded value is in another block inside a loop.
 define i32 @test31(i1 %c, i32* %p, i32 %i) {
 ; CHECK-LABEL: @test31(
diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
index b2c66f2..ef05504 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
@@ -50,6 +50,76 @@ define void @test4(i32* %Q) {
   ret void
 }
 
+; PR8576 - Should delete store of 10 even though p/q are may aliases.
+define void @test2(i32 *%p, i32 *%q) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    store i32 20, i32* [[Q:%.*]], align 4
+; CHECK-NEXT:    store i32 30, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  store i32 10, i32* %p, align 4
+  store i32 20, i32* %q, align 4
+  store i32 30, i32* %p, align 4
+  ret void
+}
+
+; Should delete store of 10 even though memset is a may-store to P (P and Q may
+; alias).
+define void @test6(i32 *%p, i8 *%q) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[Q:%.*]], i8 42, i64 900, i1 false)
+; CHECK-NEXT:    store i32 30, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  store i32 10, i32* %p, align 4       ;; dead.
+  call void @llvm.memset.p0i8.i64(i8* %q, i8 42, i64 900, i1 false)
+  store i32 30, i32* %p, align 4
+  ret void
+}
+
+; Should delete store of 10 even though memset is a may-store to P (P and Q may
+; alias).
+define void @test6_atomic(i32* align 4 %p, i8* align 4 %q) {
+; CHECK-LABEL: @test6_atomic(
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 [[Q:%.*]], i8 42, i64 900, i32 4)
+; CHECK-NEXT:    store atomic i32 30, i32* [[P:%.*]] unordered, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic i32 10, i32* %p unordered, align 4       ;; dead.
+  call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 4 %q, i8 42, i64 900, i32 4)
+  store atomic i32 30, i32* %p unordered, align 4
+  ret void
+}
+
+; Should delete store of 10 even though memcpy is a may-store to P (P and Q may
+; alias).
+define void @test7(i32 *%p, i8 *%q, i8* noalias %r) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[Q:%.*]], i8* [[R:%.*]], i64 900, i1 false)
+; CHECK-NEXT:    store i32 30, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  store i32 10, i32* %p, align 4       ;; dead.
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %q, i8* %r, i64 900, i1 false)
+  store i32 30, i32* %p, align 4
+  ret void
+}
+
+; Should delete store of 10 even though memcpy is a may-store to P (P and Q may
+; alias).
+define void @test7_atomic(i32* align 4 %p, i8* align 4 %q, i8* noalias align 4 %r) {
+; CHECK-LABEL: @test7_atomic(
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 [[Q:%.*]], i8* align 4 [[R:%.*]], i64 900, i32 4)
+; CHECK-NEXT:    store atomic i32 30, i32* [[P:%.*]] unordered, align 4
+; CHECK-NEXT:    ret void
+;
+  store atomic i32 10, i32* %p unordered, align 4       ;; dead.
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %q, i8* align 4 %r, i64 900, i32 4)
+  store atomic i32 30, i32* %p unordered, align 4
+  ret void
+}
+
+
 ; va_arg has fuzzy dependence, the store shouldn't be zapped.
 define double @test10(i8* %X) {
 ; CHECK-LABEL: @test10(
@@ -64,6 +134,46 @@ define double @test10(i8* %X) {
   ret double %tmp.0
 }
 
+; %P doesn't escape, the DEAD instructions should be removed.
+declare void @test13f()
+define i32* @test13() {
+; CHECK-LABEL: @test13(
+; CHECK-NEXT:    [[PTR:%.*]] = tail call i8* @malloc(i32 4)
+; CHECK-NEXT:    [[P:%.*]] = bitcast i8* [[PTR]] to i32*
+; CHECK-NEXT:    call void @test13f()
+; CHECK-NEXT:    store i32 0, i32* [[P]]
+; CHECK-NEXT:    ret i32* [[P]]
+;
+  %ptr = tail call i8* @malloc(i32 4)
+  %P = bitcast i8* %ptr to i32*
+  %DEAD = load i32, i32* %P
+  %DEAD2 = add i32 %DEAD, 1
+  store i32 %DEAD2, i32* %P
+  call void @test13f( )
+  store i32 0, i32* %P
+  ret i32* %P
+}
+
+define i32 addrspace(1)* @test13_addrspacecast() {
+; CHECK-LABEL: @test13_addrspacecast(
+; CHECK-NEXT:    [[P:%.*]] = tail call i8* @malloc(i32 4)
+; CHECK-NEXT:    [[P_BC:%.*]] = bitcast i8* [[P]] to i32*
+; CHECK-NEXT:    [[P:%.*]] = addrspacecast i32* [[P_BC]] to i32 addrspace(1)*
+; CHECK-NEXT:    call void @test13f()
+; CHECK-NEXT:    store i32 0, i32 addrspace(1)* [[P]]
+; CHECK-NEXT:    ret i32 addrspace(1)* [[P]]
+;
+  %p = tail call i8* @malloc(i32 4)
+  %p.bc = bitcast i8* %p to i32*
+  %P = addrspacecast i32* %p.bc to i32 addrspace(1)*
+  %DEAD = load i32, i32 addrspace(1)* %P
+  %DEAD2 = add i32 %DEAD, 1
+  store i32 %DEAD2, i32 addrspace(1)* %P
+  call void @test13f( )
+  store i32 0, i32 addrspace(1)* %P
+  ret i32 addrspace(1)* %P
+}
+
 
 declare noalias i8* @malloc(i32)
 declare noalias i8* @calloc(i32, i32)
@@ -108,6 +218,26 @@ define noalias i8* @test23() nounwind uwtable ssp {
   ret i8* %call
 }
 
+; Make sure same sized store to later element is deleted
+define void @test24([2 x i32]* %a, i32 %b, i32 %c) nounwind {
+; CHECK-LABEL: @test24(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A:%.*]], i64 0, i64 0
+; CHECK-NEXT:    store i32 [[B:%.*]], i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [2 x i32], [2 x i32]* [[A]], i64 0, i64 1
+; CHECK-NEXT:    store i32 [[C:%.*]], i32* [[TMP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %1 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0
+  store i32 0, i32* %1, align 4
+  %2 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1
+  store i32 0, i32* %2, align 4
+  %3 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 0
+  store i32 %b, i32* %3, align 4
+  %4 = getelementptr inbounds [2 x i32], [2 x i32]* %a, i64 0, i64 1
+  store i32 %c, i32* %4, align 4
+  ret void
+}
+
 ; Check another case like PR13547 where strdup is not like malloc.
 define i8* @test25(i8* %p) nounwind {
 ; CHECK-LABEL: @test25(
@@ -187,6 +317,35 @@ bb3:
 
 declare void @unknown_func()
 
+; Don't remove redundant store because of unknown call.
+define i32 @test30(i1 %c, i32* %p, i32 %i) {
+; CHECK-LABEL: @test30(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[V:%.*]] = load i32, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    call void @unknown_func()
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    store i32 [[V]], i32* [[P]], align 4
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %v = load i32, i32* %p, align 4
+  br i1 %c, label %bb1, label %bb2
+bb1:
+  br label %bb3
+bb2:
+  ; Might overwrite value at %p
+  call void @unknown_func()
+  br label %bb3
+bb3:
+  store i32 %v, i32* %p, align 4
+  ret i32 0
+}
+
 ; Don't remove redundant store in a loop with a may-alias store.
 define i32 @test32(i1 %c, i32* %p, i32 %i) {
 ; CHECK-LABEL: @test32(
@@ -294,8 +453,7 @@ define void @test37_atomic(i8* %P, i8* %Q, i8* %R) {
 ; The memmove is dead, because memcpy arguments cannot overlap.
 define void @test38(i8* %P, i8* %Q, i8* %R) {
 ; CHECK-LABEL: @test38(
-; CHECK-NEXT:    tail call void @llvm.memmove.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[Q:%.*]], i64 12, i1 false)
-; CHECK-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P]], i8* [[R:%.*]], i64 12, i1 false)
+; CHECK-NEXT:    tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[P:%.*]], i8* [[R:%.*]], i64 12, i1 false)
 ; CHECK-NEXT:    ret void
 ;
 
@@ -307,8 +465,7 @@ define void @test38(i8* %P, i8* %Q, i8* %R) {
 ; The memmove is dead, because memcpy arguments cannot overlap.
 define void @test38_atomic(i8* %P, i8* %Q, i8* %R) {
 ; CHECK-LABEL: @test38_atomic(
-; CHECK-NEXT:    tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[Q:%.*]], i64 12, i32 1)
-; CHECK-NEXT:    tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P]], i8* align 1 [[R:%.*]], i64 12, i32 1)
+; CHECK-NEXT:    tail call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[P:%.*]], i8* align 1 [[R:%.*]], i64 12, i32 1)
 ; CHECK-NEXT:    ret void
 ;
 
@@ -379,7 +536,9 @@ declare void @free(i8* nocapture)
 define void @test41(i32* noalias %P) {
 ; CHECK-LABEL: @test41(
 ; CHECK-NEXT:    [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8*
+; CHECK-NEXT:    store i32 1, i32* [[P]]
 ; CHECK-NEXT:    call void @unknown_func()
+; CHECK-NEXT:    store i32 2, i32* [[P]]
 ; CHECK-NEXT:    call void @free(i8* [[P2]])
 ; CHECK-NEXT:    ret void
 ;
-- 
2.7.4