From: Nirav Dave <niravd@google.com>
Date: Tue, 16 May 2017 19:43:56 +0000 (+0000)
Subject: Elide stores which are overwritten without being observed.
X-Git-Tag: llvmorg-5.0.0-rc1~4944
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=da8f2212731a3a12407a86a462ad84d5fcf86985;p=platform%2Fupstream%2Fllvm.git

Elide stores which are overwritten without being observed.

Summary:
In SelectionDAG, when a store is immediately chained to another store
to the same address, elide the first store as it has no observable
effects. This is causes small improvements dealing with intrinsics
lowered to stores.

Test notes:

* Many testcases overwrite store addresses multiple times and needed
  minor changes, mainly making stores volatile to prevent the
  optimization from optimizing the test away.

* Many X86 test cases optimized out instructions associated with
  associated with va_start.

* Note that test_splat in CodeGen/AArch64/misched-stp.ll no longer has
  dependencies to check and can probably be removed and potentially
  replaced with another test.

Reviewers: rnk, john.brawn

Subscribers: aemerson, rengolin, qcolombet, jyknight, nemanjai, nhaehnle, javed.absar, llvm-commits

Differential Revision: https://reviews.llvm.org/D33206

llvm-svn: 303198
---

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index caf5cb4..0ccee17 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13087,14 +13087,28 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     }
   }
 
-  // If this is a store followed by a store with the same value to the same
-  // location, then the store is dead/noop.
   if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
-    if (ST1->getBasePtr() == Ptr && ST->getMemoryVT() == ST1->getMemoryVT() &&
-        ST1->getValue() == Value && ST->isUnindexed() && !ST->isVolatile() &&
-        ST1->isUnindexed() && !ST1->isVolatile()) {
-      // The store is dead, remove it.
-      return Chain;
+    if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() &&
+        !ST1->isVolatile() && ST1->getBasePtr() == Ptr &&
+        ST->getMemoryVT() == ST1->getMemoryVT()) {
+      // If this is a store followed by a store with the same value to the same
+      // location, then the store is dead/noop.
+      if (ST1->getValue() == Value) {
+        // The store is dead, remove it.
+        return Chain;
+      }
+
+      // If this is a store who's preceeding store to the same location
+      // and no one other node is chained to that store we can effectively
+      // drop the store. Do not remove stores to undef as they may be used as
+      // data sinks.
+      if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
+          !ST1->getBasePtr().isUndef()) {
+        // ST1 is fully overwritten and can be elided. Combine with it's chain
+        // value.
+        CombineTo(ST1, ST1->getChain());
+        return SDValue();
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/AArch64/ldst-zero.ll b/llvm/test/CodeGen/AArch64/ldst-zero.ll
index 95b92ac..7d443a6 100644
--- a/llvm/test/CodeGen/AArch64/ldst-zero.ll
+++ b/llvm/test/CodeGen/AArch64/ldst-zero.ll
@@ -9,9 +9,9 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
 ; Original test case which exhibited the bug
 define void @test1(%struct.tree_common* %t, i32 %code, i8* %type) {
 ; CHECK-LABEL: test1:
-; CHECK: stp xzr, xzr, [x0, #8]
-; CHECK: stp xzr, x2, [x0]
-; CHECK: str w1, [x0, #16]
+; CHECK-DAG: stp x2, xzr, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: str xzr, [x0]
 entry:
   %0 = bitcast %struct.tree_common* %t to i8*
   tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 24, i32 8, i1 false)
@@ -25,10 +25,8 @@ entry:
 ; Store to each struct element instead of using memset
 define void @test2(%struct.tree_common* %t, i32 %code, i8* %type) {
 ; CHECK-LABEL: test2:
-; CHECK: stp xzr, xzr, [x0]
-; CHECK: str wzr, [x0, #16]
-; CHECK: str w1, [x0, #16]
-; CHECK: str x2, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: stp xzr, x2, [x0]
 entry:
   %0 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 0
   %1 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 1
@@ -44,9 +42,9 @@ entry:
 ; Vector store instead of memset
 define void @test3(%struct.tree_common* %t, i32 %code, i8* %type) {
 ; CHECK-LABEL: test3:
-; CHECK: stp xzr, xzr, [x0, #8]
-; CHECK: stp xzr, x2, [x0]
-; CHECK: str w1, [x0, #16]
+; CHECK-DAG: stp x2, xzr, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: str xzr, [x0]
 entry:
   %0 = bitcast %struct.tree_common* %t to <3 x i64>*
   store <3 x i64> zeroinitializer, <3 x i64>* %0, align 8
@@ -60,9 +58,8 @@ entry:
 ; Vector store, then store to vector elements
 define void @test4(<3 x i64>* %p, i64 %x, i64 %y) {
 ; CHECK-LABEL: test4:
-; CHECK: stp xzr, xzr, [x0, #8]
-; CHECK: stp xzr, x2, [x0]
-; CHECK: str x1, [x0, #16]
+; CHECK-DAG: stp x2, x1, [x0, #8]
+; CHECK-DAG: str xzr, [x0]
 entry:
   store <3 x i64> zeroinitializer, <3 x i64>* %p, align 8
   %0 = bitcast <3 x i64>* %p to i64*
diff --git a/llvm/test/CodeGen/AArch64/misched-stp.ll b/llvm/test/CodeGen/AArch64/misched-stp.ll
index 4ea481c..1c9ea68 100644
--- a/llvm/test/CodeGen/AArch64/misched-stp.ll
+++ b/llvm/test/CodeGen/AArch64/misched-stp.ll
@@ -1,20 +1,18 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -mtriple=aarch64 -mcpu=cyclone -mattr=+use-aa -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mcpu=cyclone -mattr=+use-aa -enable-misched -verify-misched -o - | FileCheck %s
 
 ; Tests to check that the scheduler dependencies derived from alias analysis are
 ; correct when we have loads that have been split up so that they can later be
 ; merged into STP.
 
-; CHECK: ********** MI Scheduling **********
-; CHECK: test_splat:BB#0 entry
-; CHECK: SU({{[0-9]+}}):   STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 3; mem:ST4[%3+8]
-; CHECK: Successors:
-; CHECK-NEXT: ord  [[SU1:SU\([0-9]+\)]]
-; CHECK: SU({{[0-9]+}}):   STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 2; mem:ST4[%3+4]
-; CHECK: Successors:
-; CHECK-NEXT: ord  [[SU2:SU\([0-9]+\)]]
-; CHECK: [[SU1]]:   STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 3; mem:ST4[%2]
-; CHECK: [[SU2]]:   STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 2; mem:ST4[%1]
+; Now that overwritten stores are elided in SelectionDAG, dependencies
+; are resolved and removed before MISCHED. Check that we have
+; equivalent pair of stp calls as a baseline.
+
+; CHECK-LABEL: test_splat
+; CHECK:     ldr [[REG:w[0-9]+]], [x2]
+; CHECK-DAG: stp w0, [[REG]], [x2, #12]
+; CHECK-DAG: stp [[REG]], w1, [x2, #4]
 define void @test_splat(i32 %x, i32 %y, i32* %p) {
 entry:
   %val = load i32, i32* %p, align 4
@@ -35,16 +33,11 @@ entry:
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
 %struct.tree_common = type { i8*, i8*, i32 }
 
-; CHECK: ********** MI Scheduling **********
-; CHECK: test_zero:BB#0 entry
-; CHECK: SU({{[0-9]+}}):   STRXui %XZR, %vreg{{[0-9]+}}, 2; mem:ST8[%0+16]
-; CHECK: Successors:
-; CHECK-NEXT: ord  [[SU3:SU\([0-9]+\)]]
-; CHECK: SU({{[0-9]+}}):   STRXui %XZR, %vreg{{[0-9]+}}, 1; mem:ST8[%0+8]
-; CHECK: Successors:
-; CHECK-NEXT: ord  [[SU4:SU\([0-9]+\)]]
-; CHECK: [[SU3]]:   STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 4; mem:ST4[%code1]
-; CHECK: [[SU4]]:   STRXui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 1; mem:ST8[%type2]
+; CHECK-LABEL: test_zero
+; CHECK-DAG: stp x2, xzr, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: str xzr, [x0]
+
 define void @test_zero(%struct.tree_common* %t, i32 %code, i8* %type) {
 entry:
   %0 = bitcast %struct.tree_common* %t to i8*
diff --git a/llvm/test/CodeGen/AMDGPU/global-constant.ll b/llvm/test/CodeGen/AMDGPU/global-constant.ll
index 80acfcc..1898c8f 100644
--- a/llvm/test/CodeGen/AMDGPU/global-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-constant.ll
@@ -29,10 +29,10 @@
 define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) {
   %ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @private1, i32 0, i32 %index
   %val = load float, float addrspace(2)* %ptr
-  store float %val, float addrspace(1)* %out
+  store volatile float %val, float addrspace(1)* %out
   %ptr2 = getelementptr [4 x float], [4 x float] addrspace(2) * @private2, i32 0, i32 %index
   %val2 = load float, float addrspace(2)* %ptr2
-  store float %val2, float addrspace(1)* %out
+  store volatile float %val2, float addrspace(1)* %out
   ret void
 }
 
diff --git a/llvm/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll b/llvm/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
index 8403dd9..777eccb 100644
--- a/llvm/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
+++ b/llvm/test/CodeGen/ARM/2011-02-04-AntidepMultidef.ll
@@ -20,7 +20,7 @@ bb3:                                              ; preds = %bb, %entry
 
 bb8:                                              ; preds = %bb3
   %1 = getelementptr inbounds i8, i8* %0, i32 0
-  store i8 0, i8* %1, align 1
+  store volatile i8 0, i8* %1, align 1
   %2 = call i32 @ptou() nounwind
   ; CHECK: umull [[REGISTER:lr|r[0-9]+]],
   ; CHECK-NOT: [[REGISTER]],
@@ -35,7 +35,7 @@ bb8:                                              ; preds = %bb3
   %7 = or i8 %6, 48
   %8 = add i8 %6, 87
   %iftmp.5.0.1 = select i1 %5, i8 %7, i8 %8
-  store i8 %iftmp.5.0.1, i8* %p8, align 1
+  store volatile i8 %iftmp.5.0.1, i8* %p8, align 1
   ; CHECK: umull [[REGISTER:lr|r[0-9]+]],
   ; CHECK-NOT: [[REGISTER]],
   ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -49,7 +49,7 @@ bb8:                                              ; preds = %bb3
   %13 = or i8 %12, 48
   %14 = add i8 %12, 87
   %iftmp.5.0.2 = select i1 %11, i8 %13, i8 %14
-  store i8 %iftmp.5.0.2, i8* %p8, align 1
+  store volatile i8 %iftmp.5.0.2, i8* %p8, align 1
   ; CHECK: umull [[REGISTER:lr|r[0-9]+]],
   ; CHECK-NOT: [[REGISTER]],
   ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -63,7 +63,7 @@ bb8:                                              ; preds = %bb3
   %19 = or i8 %18, 48
   %20 = add i8 %18, 87
   %iftmp.5.0.4 = select i1 %17, i8 %19, i8 %20
-  store i8 %iftmp.5.0.4, i8* null, align 1
+  store volatile i8 %iftmp.5.0.4, i8* null, align 1
   ; CHECK: umull [[REGISTER:lr|r[0-9]+]],
   ; CHECK-NOT: [[REGISTER]],
   ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -74,7 +74,7 @@ bb8:                                              ; preds = %bb3
   %22 = urem i32 %21, 10
   %23 = icmp ult i32 %22, 10
   %iftmp.5.0.5 = select i1 %23, i8 0, i8 %val8
-  store i8 %iftmp.5.0.5, i8* %p8, align 1
+  store volatile i8 %iftmp.5.0.5, i8* %p8, align 1
   ; CHECK: umull [[REGISTER:lr|r[0-9]+]],
   ; CHECK-NOT: [[REGISTER]],
   ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -88,7 +88,7 @@ bb8:                                              ; preds = %bb3
   %28 = or i8 %27, 48
   %29 = add i8 %27, 87
   %iftmp.5.0.6 = select i1 %26, i8 %28, i8 %29
-  store i8 %iftmp.5.0.6, i8* %p8, align 1
+  store volatile i8 %iftmp.5.0.6, i8* %p8, align 1
   ; CHECK: umull [[REGISTER:lr|r[0-9]+]],
   ; CHECK-NOT: [[REGISTER]],
   ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -102,7 +102,7 @@ bb8:                                              ; preds = %bb3
   %34 = or i8 %33, 48
   %35 = add i8 %33, 87
   %iftmp.5.0.7 = select i1 %32, i8 %34, i8 %35
-  store i8 %iftmp.5.0.7, i8* %p8, align 1
+  store volatile i8 %iftmp.5.0.7, i8* %p8, align 1
   ; CHECK: umull [[REGISTER:lr|r[0-9]+]],
   ; CHECK-NOT: [[REGISTER]],
   ; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
@@ -116,7 +116,7 @@ bb8:                                              ; preds = %bb3
   %40 = or i8 %39, 48
   %41 = add i8 %39, 87
   %iftmp.5.0.8 = select i1 %38, i8 %40, i8 %41
-  store i8 %iftmp.5.0.8, i8* null, align 1
+  store volatile i8 %iftmp.5.0.8, i8* null, align 1
   br label %bb46
 
 bb46:                                             ; preds = %bb3
diff --git a/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll b/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
index 2a5af61..9548602 100644
--- a/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
+++ b/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
@@ -13,7 +13,7 @@ entry:
 ; CHECK: sub       sp, sp, #12
 ; CHECK: sub       sp, sp, #4
 ; CHECK: add       r0, sp, #4
-; CHECK: stm       sp, {r0, r1, r2, r3}
+; CHECK: stmib     sp, {r1, r2, r3}
   %g = alloca i8*
   %g1 = bitcast i8** %g to i8*
   call void @llvm.va_start(i8* %g1)
diff --git a/llvm/test/CodeGen/ARM/dag-combine-ldst.ll b/llvm/test/CodeGen/ARM/dag-combine-ldst.ll
index c1960ee..077754e 100644
--- a/llvm/test/CodeGen/ARM/dag-combine-ldst.ll
+++ b/llvm/test/CodeGen/ARM/dag-combine-ldst.ll
@@ -8,7 +8,7 @@
 ; CHECK-LABEL:   {{^}}main
 ; CHECK:         mov [[TMP:r[0-9]+]], #0
 ; CHECK-NEXT:    str [[TMP]], [sp, #4]
-; CHECK-NEXT:    str [[TMP]], [sp]
+; CHECK_O0:      str [[TMP]], [sp]
 ; CHECK_O0:      ldr [[TMP:r[0-9]+]], [sp]
 ; CHECK_O0-NEXT: add [[TMP]], [[TMP]], #2
 ; CHECK_O1-NOT:  ldr [[TMP:r[0-9]+]], [sp]
diff --git a/llvm/test/CodeGen/MSP430/vararg.ll b/llvm/test/CodeGen/MSP430/vararg.ll
index 6c8bceff..a708b89 100644
--- a/llvm/test/CodeGen/MSP430/vararg.ll
+++ b/llvm/test/CodeGen/MSP430/vararg.ll
@@ -25,7 +25,6 @@ define i16 @va_arg(i8* %vl) nounwind {
 entry:
 ; CHECK-LABEL: va_arg:
   %vl.addr = alloca i8*, align 2
-; CHECK: mov.w r12, 0(r1)
   store i8* %vl, i8** %vl.addr, align 2
 ; CHECK: mov.w r12, [[REG:r[0-9]+]]
 ; CHECK-NEXT: add.w #2, [[REG]]
diff --git a/llvm/test/CodeGen/Mips/msa/bmzi_bmnzi.ll b/llvm/test/CodeGen/Mips/msa/bmzi_bmnzi.ll
index d1cb3c3..de62dcd 100644
--- a/llvm/test/CodeGen/Mips/msa/bmzi_bmnzi.ll
+++ b/llvm/test/CodeGen/Mips/msa/bmzi_bmnzi.ll
@@ -9,9 +9,9 @@ entry:
   %0 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG1
   %1 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG2
   %2 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 240)
-  store <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
+  store volatile <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
   %3 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 15)
-  store <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
+  store volatile <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
   %4 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 170)
   store <16 x i8> %4, <16 x i8>* @llvm_mips_bmnzi_b_RES
   ret void
@@ -32,9 +32,9 @@ entry:
   %0 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG1
   %1 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG2
   %2 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 240)
-  store <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
+  store volatile <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
   %3 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 15)
-  store <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
+  store volatile <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
   %4 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 170)
   store <16 x i8> %4, <16 x i8>* @llvm_mips_bmnzi_b_RES
   ret void
diff --git a/llvm/test/CodeGen/PowerPC/ppcf128sf.ll b/llvm/test/CodeGen/PowerPC/ppcf128sf.ll
index 6804b55..fde7d48 100644
--- a/llvm/test/CodeGen/PowerPC/ppcf128sf.ll
+++ b/llvm/test/CodeGen/PowerPC/ppcf128sf.ll
@@ -14,19 +14,19 @@ entry:
   %0 = load ppc_fp128, ppc_fp128* @ld, align 16
   %1 = load ppc_fp128, ppc_fp128* @ld2, align 16
   %add = fadd ppc_fp128 %0, %1
-  store ppc_fp128 %add, ppc_fp128* %c, align 16
+  store volatile ppc_fp128 %add, ppc_fp128* %c, align 16
   %2 = load ppc_fp128, ppc_fp128* @ld, align 16
   %3 = load ppc_fp128, ppc_fp128* @ld2, align 16
   %sub = fsub ppc_fp128 %2, %3
-  store ppc_fp128 %sub, ppc_fp128* %c, align 16
+  store volatile ppc_fp128 %sub, ppc_fp128* %c, align 16
   %4 = load ppc_fp128, ppc_fp128* @ld, align 16
   %5 = load ppc_fp128, ppc_fp128* @ld2, align 16
   %mul = fmul ppc_fp128 %4, %5
-  store ppc_fp128 %mul, ppc_fp128* %c, align 16
+  store volatile ppc_fp128 %mul, ppc_fp128* %c, align 16
   %6 = load ppc_fp128, ppc_fp128* @ld, align 16
   %7 = load ppc_fp128, ppc_fp128* @ld2, align 16
   %div = fdiv ppc_fp128 %6, %7
-  store ppc_fp128 %div, ppc_fp128* %c, align 16
+  store volatile ppc_fp128 %div, ppc_fp128* %c, align 16
   ret void
 
   ; CHECK-LABEL:    __gcc_qadd
diff --git a/llvm/test/CodeGen/SPARC/32abi.ll b/llvm/test/CodeGen/SPARC/32abi.ll
index 09e7a3a..3807f84 100644
--- a/llvm/test/CodeGen/SPARC/32abi.ll
+++ b/llvm/test/CodeGen/SPARC/32abi.ll
@@ -25,17 +25,17 @@ define void @intarg(i8  %a0,   ; %i0
                     i32 %a5,   ; %i5
                     i32 signext %a6,   ; [%fp+92]
                     i8* %a7) { ; [%fp+96]
-  store i8 %a0, i8* %a4
-  store i8 %a1, i8* %a4
+  store volatile i8 %a0, i8* %a4
+  store volatile i8 %a1, i8* %a4
   %p16 = bitcast i8* %a4 to i16*
-  store i16 %a2, i16* %p16
+  store volatile i16 %a2, i16* %p16
   %p32 = bitcast i8* %a4 to i32*
-  store i32 %a3, i32* %p32
+  store volatile i32 %a3, i32* %p32
   %pp = bitcast i8* %a4 to i8**
-  store i8* %a4, i8** %pp
-  store i32 %a5, i32* %p32
-  store i32 %a6, i32* %p32
-  store i8* %a7, i8** %pp
+  store volatile i8* %a4, i8** %pp
+  store volatile i32 %a5, i32* %p32
+  store volatile i32 %a6, i32* %p32
+  store volatile i8* %a7, i8** %pp
   ret void
 }
 
diff --git a/llvm/test/CodeGen/SPARC/64abi.ll b/llvm/test/CodeGen/SPARC/64abi.ll
index b963be2..771cc40 100644
--- a/llvm/test/CodeGen/SPARC/64abi.ll
+++ b/llvm/test/CodeGen/SPARC/64abi.ll
@@ -24,17 +24,17 @@ define void @intarg(i8  %a0,   ; %i0
                     i32 %a5,   ; %i5
                     i32 signext %a6,   ; [%fp+BIAS+176]
                     i8* %a7) { ; [%fp+BIAS+184]
-  store i8 %a0, i8* %a4
-  store i8 %a1, i8* %a4
+  store volatile i8 %a0, i8* %a4
+  store volatile i8 %a1, i8* %a4
   %p16 = bitcast i8* %a4 to i16*
-  store i16 %a2, i16* %p16
+  store volatile i16 %a2, i16* %p16
   %p32 = bitcast i8* %a4 to i32*
-  store i32 %a3, i32* %p32
+  store volatile i32 %a3, i32* %p32
   %pp = bitcast i8* %a4 to i8**
-  store i8* %a4, i8** %pp
-  store i32 %a5, i32* %p32
-  store i32 %a6, i32* %p32
-  store i8* %a7, i8** %pp
+  store volatile i8* %a4, i8** %pp
+  store volatile i32 %a5, i32* %p32
+  store volatile i32 %a6, i32* %p32
+  store volatile i8* %a7, i8** %pp
   ret void
 }
 
@@ -316,7 +316,7 @@ define void @call_ret_i64_pair(i64* %i0) {
   %rv = call { i64, i64 } @ret_i64_pair(i32 undef, i32 undef,
                                         i64* undef, i64* undef)
   %e0 = extractvalue { i64, i64 } %rv, 0
-  store i64 %e0, i64* %i0
+  store volatile i64 %e0, i64* %i0
   %e1 = extractvalue { i64, i64 } %rv, 1
   store i64 %e1, i64* %i0
   ret void
diff --git a/llvm/test/CodeGen/SystemZ/swift-return.ll b/llvm/test/CodeGen/SystemZ/swift-return.ll
index 69d0e97..977816f 100644
--- a/llvm/test/CodeGen/SystemZ/swift-return.ll
+++ b/llvm/test/CodeGen/SystemZ/swift-return.ll
@@ -189,11 +189,11 @@ define void @consume_i1_ret() {
   %v6 = extractvalue { i1, i1, i1, i1 } %call, 2
   %v7 = extractvalue { i1, i1, i1, i1 } %call, 3
   %val = zext i1 %v3 to i32
-  store i32 %val, i32* @var
+  store volatile i32 %val, i32* @var
   %val2 = zext i1 %v5 to i32
-  store i32 %val2, i32* @var
+  store volatile i32 %val2, i32* @var
   %val3 = zext i1 %v6 to i32
-  store i32 %val3, i32* @var
+  store volatile i32 %val3, i32* @var
   %val4 = zext i1 %v7 to i32
   store i32 %val4, i32* @var
   ret void
diff --git a/llvm/test/CodeGen/Thumb/stack-access.ll b/llvm/test/CodeGen/Thumb/stack-access.ll
index 44217ab..533559a 100644
--- a/llvm/test/CodeGen/Thumb/stack-access.ll
+++ b/llvm/test/CodeGen/Thumb/stack-access.ll
@@ -7,13 +7,13 @@ define void @test1(i8** %p) {
   %z = alloca i8, align 1
 ; CHECK: add r1, sp, #8
 ; CHECK: str r1, [r0]
-  store i8* %x, i8** %p, align 4
+  store volatile i8* %x, i8** %p, align 4
 ; CHECK: add r1, sp, #4
 ; CHECK: str r1, [r0]
-  store i8* %y, i8** %p, align 4
+  store volatile i8* %y, i8** %p, align 4
 ; CHECK: mov r1, sp
 ; CHECK: str r1, [r0]
-  store i8* %z, i8** %p, align 4
+  store volatile i8* %z, i8** %p, align 4
   ret void
 }
 
@@ -24,10 +24,10 @@ define void @test2([1024 x i8]** %p) {
 ; CHECK: add r1, sp, #1020
 ; CHECK: adds r1, #4
 ; CHECK: str r1, [r0]
-  store [1024 x i8]* %arr1, [1024 x i8]** %p, align 4
+  store volatile [1024 x i8]* %arr1, [1024 x i8]** %p, align 4
 ; CHECK: mov r1, sp
 ; CHECK: str r1, [r0]
-  store [1024 x i8]* %arr2, [1024 x i8]** %p, align 4
+  store volatile [1024 x i8]* %arr2, [1024 x i8]** %p, align 4
   ret void
 }
 
diff --git a/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll b/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll
index 3e4bd02..c6d00d4 100644
--- a/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll
+++ b/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll
@@ -50,9 +50,9 @@ bb420:                                            ; preds = %bb20, %bb20
 ; CHECK: str{{(.w)?}} r{{[0-9]+}}, [sp
 ; CHECK: str{{(.w)?}} r{{[0-9]+}}, [sp
 ; CHECK: str{{(.w)?}} r{{[0-9]+}}, [sp
-  store %union.rec* null, %union.rec** @zz_hold, align 4
+  store volatile %union.rec* null, %union.rec** @zz_hold, align 4
   store %union.rec* null, %union.rec** @zz_res, align 4
-  store %union.rec* %x, %union.rec** @zz_hold, align 4
+  store volatile %union.rec* %x, %union.rec** @zz_hold, align 4
   %0 = call  %union.rec* @Manifest(%union.rec* undef, %union.rec* %env, %struct.STYLE* %style, %union.rec** %bthr, %union.rec** %fthr, %union.rec** %target, %union.rec** %crs, i32 %ok, i32 %need_expand, %union.rec** %enclose, i32 %fcr) nounwind ; <%union.rec*> [#uses=0]
   unreachable
 
diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll
index b9a2eee..126f5a1 100644
--- a/llvm/test/CodeGen/X86/arg-copy-elide.ll
+++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll
@@ -253,9 +253,7 @@ entry:
 ; CHECK: calll _addrof_i32
 ; CHECK: retl
 
-
 ; Don't elide the copy when the alloca is escaped with a store.
-
 define void @escape_with_store(i32 %x) {
   %x1 = alloca i32
   %x2 = alloca i32*
@@ -268,9 +266,8 @@ define void @escape_with_store(i32 %x) {
 }
 
 ; CHECK-LABEL: _escape_with_store:
-; CHECK-DAG: movl {{.*}}(%esp), %[[reg:[^ ]*]]
-; CHECK-DAG: movl $0, [[offs:[0-9]*]](%esp)
-; CHECK: movl %[[reg]], [[offs]](%esp)
+; CHECK: movl {{.*}}(%esp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], [[offs:[0-9]*]](%esp)
 ; CHECK: calll _addrof_i32
 
 
diff --git a/llvm/test/CodeGen/X86/nontemporal.ll b/llvm/test/CodeGen/X86/nontemporal.ll
index 33d5cab..d49c887 100644
--- a/llvm/test/CodeGen/X86/nontemporal.ll
+++ b/llvm/test/CodeGen/X86/nontemporal.ll
@@ -9,33 +9,29 @@ define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4
 ; X32-SSE:       # BB#0:
 ; X32-SSE-NEXT:    pushl %ebp
 ; X32-SSE-NEXT:    movl %esp, %ebp
-; X32-SSE-NEXT:    pushl %esi
 ; X32-SSE-NEXT:    andl $-16, %esp
 ; X32-SSE-NEXT:    subl $16, %esp
 ; X32-SSE-NEXT:    movl 72(%ebp), %eax
 ; X32-SSE-NEXT:    movl 76(%ebp), %ecx
-; X32-SSE-NEXT:    movl 12(%ebp), %edx
 ; X32-SSE-NEXT:    movdqa 56(%ebp), %xmm3
 ; X32-SSE-NEXT:    movdqa 40(%ebp), %xmm4
 ; X32-SSE-NEXT:    movdqa 24(%ebp), %xmm5
-; X32-SSE-NEXT:    movl 8(%ebp), %esi
-; X32-SSE-NEXT:    addps .LCPI0_0, %xmm0
-; X32-SSE-NEXT:    movntps %xmm0, (%esi)
-; X32-SSE-NEXT:    paddq .LCPI0_1, %xmm2
-; X32-SSE-NEXT:    movntdq %xmm2, (%esi)
-; X32-SSE-NEXT:    addpd .LCPI0_2, %xmm1
-; X32-SSE-NEXT:    movntpd %xmm1, (%esi)
-; X32-SSE-NEXT:    paddd .LCPI0_3, %xmm5
-; X32-SSE-NEXT:    movntdq %xmm5, (%esi)
-; X32-SSE-NEXT:    paddw .LCPI0_4, %xmm4
-; X32-SSE-NEXT:    movntdq %xmm4, (%esi)
-; X32-SSE-NEXT:    paddb .LCPI0_5, %xmm3
-; X32-SSE-NEXT:    movntdq %xmm3, (%esi)
-; X32-SSE-NEXT:    movntil %edx, (%esi)
-; X32-SSE-NEXT:    movntil %ecx, 4(%esi)
-; X32-SSE-NEXT:    movntil %eax, (%esi)
-; X32-SSE-NEXT:    leal -4(%ebp), %esp
-; X32-SSE-NEXT:    popl %esi
+; X32-SSE-NEXT:    movl 8(%ebp), %edx
+; X32-SSE-NEXT:    addps {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT:    movntps %xmm0, (%edx)
+; X32-SSE-NEXT:    paddq {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT:    movntdq %xmm2, (%edx)
+; X32-SSE-NEXT:    addpd {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT:    movntpd %xmm1, (%edx)
+; X32-SSE-NEXT:    paddd {{\.LCPI.*}}, %xmm5
+; X32-SSE-NEXT:    movntdq %xmm5, (%edx)
+; X32-SSE-NEXT:    paddw {{\.LCPI.*}}, %xmm4
+; X32-SSE-NEXT:    movntdq %xmm4, (%edx)
+; X32-SSE-NEXT:    paddb {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT:    movntdq %xmm3, (%edx)
+; X32-SSE-NEXT:    movntil %ecx, 4(%edx)
+; X32-SSE-NEXT:    movntil %eax, (%edx)
+; X32-SSE-NEXT:    movl %ebp, %esp
 ; X32-SSE-NEXT:    popl %ebp
 ; X32-SSE-NEXT:    retl
 ;
@@ -43,33 +39,29 @@ define void @f(<4 x float> %A, i8* %B, <2 x double> %C, i32 %D, <2 x i64> %E, <4
 ; X32-AVX:       # BB#0:
 ; X32-AVX-NEXT:    pushl %ebp
 ; X32-AVX-NEXT:    movl %esp, %ebp
-; X32-AVX-NEXT:    pushl %esi
 ; X32-AVX-NEXT:    andl $-16, %esp
 ; X32-AVX-NEXT:    subl $16, %esp
 ; X32-AVX-NEXT:    movl 72(%ebp), %eax
 ; X32-AVX-NEXT:    movl 76(%ebp), %ecx
-; X32-AVX-NEXT:    movl 12(%ebp), %edx
 ; X32-AVX-NEXT:    vmovdqa 56(%ebp), %xmm3
 ; X32-AVX-NEXT:    vmovdqa 40(%ebp), %xmm4
 ; X32-AVX-NEXT:    vmovdqa 24(%ebp), %xmm5
-; X32-AVX-NEXT:    movl 8(%ebp), %esi
-; X32-AVX-NEXT:    vaddps .LCPI0_0, %xmm0, %xmm0
-; X32-AVX-NEXT:    vmovntps %xmm0, (%esi)
-; X32-AVX-NEXT:    vpaddq .LCPI0_1, %xmm2, %xmm0
-; X32-AVX-NEXT:    vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT:    vaddpd .LCPI0_2, %xmm1, %xmm0
-; X32-AVX-NEXT:    vmovntpd %xmm0, (%esi)
-; X32-AVX-NEXT:    vpaddd .LCPI0_3, %xmm5, %xmm0
-; X32-AVX-NEXT:    vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT:    vpaddw .LCPI0_4, %xmm4, %xmm0
-; X32-AVX-NEXT:    vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT:    vpaddb .LCPI0_5, %xmm3, %xmm0
-; X32-AVX-NEXT:    vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT:    movntil %edx, (%esi)
-; X32-AVX-NEXT:    movntil %ecx, 4(%esi)
-; X32-AVX-NEXT:    movntil %eax, (%esi)
-; X32-AVX-NEXT:    leal -4(%ebp), %esp
-; X32-AVX-NEXT:    popl %esi
+; X32-AVX-NEXT:    movl 8(%ebp), %edx
+; X32-AVX-NEXT:    vaddps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-AVX-NEXT:    vmovntps %xmm0, (%edx)
+; X32-AVX-NEXT:    vpaddq {{\.LCPI.*}}, %xmm2, %xmm0
+; X32-AVX-NEXT:    vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT:    vaddpd {{\.LCPI.*}}, %xmm1, %xmm0
+; X32-AVX-NEXT:    vmovntpd %xmm0, (%edx)
+; X32-AVX-NEXT:    vpaddd {{\.LCPI.*}}, %xmm5, %xmm0
+; X32-AVX-NEXT:    vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT:    vpaddw {{\.LCPI.*}}, %xmm4, %xmm0
+; X32-AVX-NEXT:    vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT:    vpaddb {{\.LCPI.*}}, %xmm3, %xmm0
+; X32-AVX-NEXT:    vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT:    movntil %ecx, 4(%edx)
+; X32-AVX-NEXT:    movntil %eax, (%edx)
+; X32-AVX-NEXT:    movl %ebp, %esp
 ; X32-AVX-NEXT:    popl %ebp
 ; X32-AVX-NEXT:    retl
 ;
diff --git a/llvm/test/CodeGen/X86/store-narrow.ll b/llvm/test/CodeGen/X86/store-narrow.ll
index 16f152d..5e9e1e3 100644
--- a/llvm/test/CodeGen/X86/store-narrow.ll
+++ b/llvm/test/CodeGen/X86/store-narrow.ll
@@ -134,10 +134,7 @@ entry:
 @g_16 = internal global i32 -1
 
 ; X64-LABEL: test8:
-; X64-NEXT: movl _g_16(%rip), %eax
-; X64-NEXT: movl $0, _g_16(%rip)
-; X64-NEXT: orl  $1, %eax
-; X64-NEXT: movl %eax, _g_16(%rip)
+; X64-NEXT: orb  $1, _g_16(%rip)
 ; X64-NEXT: ret
 define void @test8() nounwind {
   %tmp = load i32, i32* @g_16
diff --git a/llvm/test/CodeGen/X86/swift-return.ll b/llvm/test/CodeGen/X86/swift-return.ll
index 60e33e6..0ea176d 100644
--- a/llvm/test/CodeGen/X86/swift-return.ll
+++ b/llvm/test/CodeGen/X86/swift-return.ll
@@ -184,11 +184,11 @@ define void @consume_i1_ret() {
   %v6 = extractvalue { i1, i1, i1, i1 } %call, 2
   %v7 = extractvalue { i1, i1, i1, i1 } %call, 3
   %val = zext i1 %v3 to i32
-  store i32 %val, i32* @var
+  store volatile i32 %val, i32* @var
   %val2 = zext i1 %v5 to i32
-  store i32 %val2, i32* @var
+  store volatile i32 %val2, i32* @var
   %val3 = zext i1 %v6 to i32
-  store i32 %val3, i32* @var
+  store volatile i32 %val3, i32* @var
   %val4 = zext i1 %v7 to i32
   store i32 %val4, i32* @var
   ret void
diff --git a/llvm/test/CodeGen/X86/win32-spill-xmm.ll b/llvm/test/CodeGen/X86/win32-spill-xmm.ll
index 0db97cf..c6b163b 100644
--- a/llvm/test/CodeGen/X86/win32-spill-xmm.ll
+++ b/llvm/test/CodeGen/X86/win32-spill-xmm.ll
@@ -20,7 +20,7 @@ declare void @bar(<16 x float> %a, i32 %b)
 ; Check that proper alignment of spilled vector does not affect vargs
 
 ; CHECK-LABEL: vargs_not_affected
-; CHECK: leal    28(%ebp), %eax
+; CHECK: movl 28(%ebp), %eax
 define i32 @vargs_not_affected(<4 x float> %v, i8* %f, ...) {
 entry:
   %ap = alloca i8*, align 4
diff --git a/llvm/test/CodeGen/X86/win64_sibcall.ll b/llvm/test/CodeGen/X86/win64_sibcall.ll
index 4bba0e1..42dd4d3 100644
--- a/llvm/test/CodeGen/X86/win64_sibcall.ll
+++ b/llvm/test/CodeGen/X86/win64_sibcall.ll
@@ -12,8 +12,8 @@ entry:
 ; LINUX:	movq	$0, -8(%rsp)
 
   %this = alloca %Object addrspace(1)*
-  store %Object addrspace(1)* null, %Object addrspace(1)** %this
-  store %Object addrspace(1)* %param0, %Object addrspace(1)** %this
+  store volatile %Object addrspace(1)* null, %Object addrspace(1)** %this
+  store volatile %Object addrspace(1)* %param0, %Object addrspace(1)** %this
   br label %0
 
 ; <label>:0                                       ; preds = %entry
diff --git a/llvm/test/CodeGen/X86/win64_vararg.ll b/llvm/test/CodeGen/X86/win64_vararg.ll
index 8d7f201..20386bf 100644
--- a/llvm/test/CodeGen/X86/win64_vararg.ll
+++ b/llvm/test/CodeGen/X86/win64_vararg.ll
@@ -94,9 +94,7 @@ entry:
 
 ; CHECK-LABEL: arg4:
 ; CHECK: pushq
-; va_start:
-; CHECK: leaq 48(%rsp), [[REG_arg4_1:%[a-z]+]]
-; CHECK: movq [[REG_arg4_1]], (%rsp)
+; va_start (optimized away as overwritten by va_arg)
 ; va_arg:
 ; CHECK: leaq 52(%rsp), [[REG_arg4_2:%[a-z]+]]
 ; CHECK: movq [[REG_arg4_2]], (%rsp)
diff --git a/llvm/test/CodeGen/X86/x86-64-ms_abi-vararg.ll b/llvm/test/CodeGen/X86/x86-64-ms_abi-vararg.ll
index e343652..299190e 100644
--- a/llvm/test/CodeGen/X86/x86-64-ms_abi-vararg.ll
+++ b/llvm/test/CodeGen/X86/x86-64-ms_abi-vararg.ll
@@ -90,9 +90,7 @@ entry:
 }
 
 ; CHECK-LABEL: arg4:
-; va_start:
-; CHECK: leaq 48(%rsp), [[REG_arg4_1:%[a-z]+]]
-; CHECK: movq [[REG_arg4_1]], (%rsp)
+; va_start (optimized away as overwritten by va_arg)
 ; va_arg:
 ; CHECK: leaq 52(%rsp), [[REG_arg4_2:%[a-z]+]]
 ; CHECK: movq [[REG_arg4_2]], (%rsp)