}
}
- // If this is a store followed by a store with the same value to the same
- // location, then the store is dead/noop.
if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
- if (ST1->getBasePtr() == Ptr && ST->getMemoryVT() == ST1->getMemoryVT() &&
- ST1->getValue() == Value && ST->isUnindexed() && !ST->isVolatile() &&
- ST1->isUnindexed() && !ST1->isVolatile()) {
- // The store is dead, remove it.
- return Chain;
+ if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() &&
+ !ST1->isVolatile() && ST1->getBasePtr() == Ptr &&
+ ST->getMemoryVT() == ST1->getMemoryVT()) {
+ // If this is a store followed by a store with the same value to the same
+ // location, then the store is dead/noop.
+ if (ST1->getValue() == Value) {
+ // The store is dead, remove it.
+ return Chain;
+ }
+
+ // If this store's immediately preceding store is to the same location and
+ // no other node is chained to that preceding store, we can effectively
+ // drop the preceding store. Do not remove stores to undef as they may be
+ // used as data sinks.
+ if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
+ !ST1->getBasePtr().isUndef()) {
+ // ST1 is fully overwritten and can be elided. Combine with its chain
+ // value.
+ CombineTo(ST1, ST1->getChain());
+ return SDValue();
+ }
}
}
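As an illustrative sketch (not part of the patch; function name hypothetical), the new combine targets IR like the following, where the first store is fully overwritten by a later store to the same location and, having no other chain users, can be dropped (assuming optimization is enabled, since the elision is guarded by OptLevel):

define void @overwritten_store(i32* %p) {
entry:
  store i32 0, i32* %p, align 4   ; dead: fully overwritten before it can be observed
  store i32 7, i32* %p, align 4   ; surviving store
  ret void
}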
; Original test case which exhibited the bug
define void @test1(%struct.tree_common* %t, i32 %code, i8* %type) {
; CHECK-LABEL: test1:
-; CHECK: stp xzr, xzr, [x0, #8]
-; CHECK: stp xzr, x2, [x0]
-; CHECK: str w1, [x0, #16]
+; CHECK-DAG: stp x2, xzr, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: str xzr, [x0]
entry:
%0 = bitcast %struct.tree_common* %t to i8*
tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 24, i32 8, i1 false)
; Store to each struct element instead of using memset
define void @test2(%struct.tree_common* %t, i32 %code, i8* %type) {
; CHECK-LABEL: test2:
-; CHECK: stp xzr, xzr, [x0]
-; CHECK: str wzr, [x0, #16]
-; CHECK: str w1, [x0, #16]
-; CHECK: str x2, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: stp xzr, x2, [x0]
entry:
%0 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 0
%1 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 1
; Vector store instead of memset
define void @test3(%struct.tree_common* %t, i32 %code, i8* %type) {
; CHECK-LABEL: test3:
-; CHECK: stp xzr, xzr, [x0, #8]
-; CHECK: stp xzr, x2, [x0]
-; CHECK: str w1, [x0, #16]
+; CHECK-DAG: stp x2, xzr, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: str xzr, [x0]
entry:
%0 = bitcast %struct.tree_common* %t to <3 x i64>*
store <3 x i64> zeroinitializer, <3 x i64>* %0, align 8
; Vector store, then store to vector elements
define void @test4(<3 x i64>* %p, i64 %x, i64 %y) {
; CHECK-LABEL: test4:
-; CHECK: stp xzr, xzr, [x0, #8]
-; CHECK: stp xzr, x2, [x0]
-; CHECK: str x1, [x0, #16]
+; CHECK-DAG: stp x2, x1, [x0, #8]
+; CHECK-DAG: str xzr, [x0]
entry:
store <3 x i64> zeroinitializer, <3 x i64>* %p, align 8
%0 = bitcast <3 x i64>* %p to i64*
; REQUIRES: asserts
-; RUN: llc < %s -mtriple=aarch64 -mcpu=cyclone -mattr=+use-aa -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mcpu=cyclone -mattr=+use-aa -enable-misched -verify-misched -o - | FileCheck %s
; Tests to check that the scheduler dependencies derived from alias analysis are
; correct when we have loads that have been split up so that they can later be
; merged into STP.
-; CHECK: ********** MI Scheduling **********
-; CHECK: test_splat:BB#0 entry
-; CHECK: SU({{[0-9]+}}): STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 3; mem:ST4[%3+8]
-; CHECK: Successors:
-; CHECK-NEXT: ord [[SU1:SU\([0-9]+\)]]
-; CHECK: SU({{[0-9]+}}): STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 2; mem:ST4[%3+4]
-; CHECK: Successors:
-; CHECK-NEXT: ord [[SU2:SU\([0-9]+\)]]
-; CHECK: [[SU1]]: STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 3; mem:ST4[%2]
-; CHECK: [[SU2]]: STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 2; mem:ST4[%1]
+; Now that overwritten stores are elided in SelectionDAG, the dependencies
+; are resolved and removed before MISCHED runs. Check that we get an
+; equivalent pair of stp instructions as a baseline.
+
+; CHECK-LABEL: test_splat
+; CHECK: ldr [[REG:w[0-9]+]], [x2]
+; CHECK-DAG: stp w0, [[REG]], [x2, #12]
+; CHECK-DAG: stp [[REG]], w1, [x2, #4]
define void @test_splat(i32 %x, i32 %y, i32* %p) {
entry:
%val = load i32, i32* %p, align 4
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
%struct.tree_common = type { i8*, i8*, i32 }
-; CHECK: ********** MI Scheduling **********
-; CHECK: test_zero:BB#0 entry
-; CHECK: SU({{[0-9]+}}): STRXui %XZR, %vreg{{[0-9]+}}, 2; mem:ST8[%0+16]
-; CHECK: Successors:
-; CHECK-NEXT: ord [[SU3:SU\([0-9]+\)]]
-; CHECK: SU({{[0-9]+}}): STRXui %XZR, %vreg{{[0-9]+}}, 1; mem:ST8[%0+8]
-; CHECK: Successors:
-; CHECK-NEXT: ord [[SU4:SU\([0-9]+\)]]
-; CHECK: [[SU3]]: STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 4; mem:ST4[%code1]
-; CHECK: [[SU4]]: STRXui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 1; mem:ST8[%type2]
+; CHECK-LABEL: test_zero
+; CHECK-DAG: stp x2, xzr, [x0, #8]
+; CHECK-DAG: str w1, [x0, #16]
+; CHECK-DAG: str xzr, [x0]
+
define void @test_zero(%struct.tree_common* %t, i32 %code, i8* %type) {
entry:
%0 = bitcast %struct.tree_common* %t to i8*
define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) {
%ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @private1, i32 0, i32 %index
%val = load float, float addrspace(2)* %ptr
- store float %val, float addrspace(1)* %out
+ store volatile float %val, float addrspace(1)* %out
%ptr2 = getelementptr [4 x float], [4 x float] addrspace(2) * @private2, i32 0, i32 %index
%val2 = load float, float addrspace(2)* %ptr2
- store float %val2, float addrspace(1)* %out
+ store volatile float %val2, float addrspace(1)* %out
ret void
}
bb8: ; preds = %bb3
%1 = getelementptr inbounds i8, i8* %0, i32 0
- store i8 0, i8* %1, align 1
+ store volatile i8 0, i8* %1, align 1
%2 = call i32 @ptou() nounwind
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
%7 = or i8 %6, 48
%8 = add i8 %6, 87
%iftmp.5.0.1 = select i1 %5, i8 %7, i8 %8
- store i8 %iftmp.5.0.1, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.1, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
%13 = or i8 %12, 48
%14 = add i8 %12, 87
%iftmp.5.0.2 = select i1 %11, i8 %13, i8 %14
- store i8 %iftmp.5.0.2, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.2, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
%19 = or i8 %18, 48
%20 = add i8 %18, 87
%iftmp.5.0.4 = select i1 %17, i8 %19, i8 %20
- store i8 %iftmp.5.0.4, i8* null, align 1
+ store volatile i8 %iftmp.5.0.4, i8* null, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
%22 = urem i32 %21, 10
%23 = icmp ult i32 %22, 10
%iftmp.5.0.5 = select i1 %23, i8 0, i8 %val8
- store i8 %iftmp.5.0.5, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.5, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
%28 = or i8 %27, 48
%29 = add i8 %27, 87
%iftmp.5.0.6 = select i1 %26, i8 %28, i8 %29
- store i8 %iftmp.5.0.6, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.6, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
%34 = or i8 %33, 48
%35 = add i8 %33, 87
%iftmp.5.0.7 = select i1 %32, i8 %34, i8 %35
- store i8 %iftmp.5.0.7, i8* %p8, align 1
+ store volatile i8 %iftmp.5.0.7, i8* %p8, align 1
; CHECK: umull [[REGISTER:lr|r[0-9]+]],
; CHECK-NOT: [[REGISTER]],
; CHECK: {{lr|r[0-9]+}}, {{lr|r[0-9]+$}}
%40 = or i8 %39, 48
%41 = add i8 %39, 87
%iftmp.5.0.8 = select i1 %38, i8 %40, i8 %41
- store i8 %iftmp.5.0.8, i8* null, align 1
+ store volatile i8 %iftmp.5.0.8, i8* null, align 1
br label %bb46
bb46: ; preds = %bb3
; CHECK: sub sp, sp, #12
; CHECK: sub sp, sp, #4
; CHECK: add r0, sp, #4
-; CHECK: stm sp, {r0, r1, r2, r3}
+; CHECK: stmib sp, {r1, r2, r3}
%g = alloca i8*
%g1 = bitcast i8** %g to i8*
call void @llvm.va_start(i8* %g1)
; CHECK-LABEL: {{^}}main
; CHECK: mov [[TMP:r[0-9]+]], #0
; CHECK-NEXT: str [[TMP]], [sp, #4]
-; CHECK-NEXT: str [[TMP]], [sp]
+; CHECK_O0: str [[TMP]], [sp]
; CHECK_O0: ldr [[TMP:r[0-9]+]], [sp]
; CHECK_O0-NEXT: add [[TMP]], [[TMP]], #2
; CHECK_O1-NOT: ldr [[TMP:r[0-9]+]], [sp]
entry:
; CHECK-LABEL: va_arg:
%vl.addr = alloca i8*, align 2
-; CHECK: mov.w r12, 0(r1)
store i8* %vl, i8** %vl.addr, align 2
; CHECK: mov.w r12, [[REG:r[0-9]+]]
; CHECK-NEXT: add.w #2, [[REG]]
%0 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG1
%1 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG2
%2 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 240)
- store <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
+ store volatile <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
%3 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 15)
- store <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
+ store volatile <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
%4 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 170)
store <16 x i8> %4, <16 x i8>* @llvm_mips_bmnzi_b_RES
ret void
%0 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG1
%1 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG2
%2 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 240)
- store <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
+ store volatile <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
%3 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 15)
- store <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
+ store volatile <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
%4 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 170)
store <16 x i8> %4, <16 x i8>* @llvm_mips_bmnzi_b_RES
ret void
%0 = load ppc_fp128, ppc_fp128* @ld, align 16
%1 = load ppc_fp128, ppc_fp128* @ld2, align 16
%add = fadd ppc_fp128 %0, %1
- store ppc_fp128 %add, ppc_fp128* %c, align 16
+ store volatile ppc_fp128 %add, ppc_fp128* %c, align 16
%2 = load ppc_fp128, ppc_fp128* @ld, align 16
%3 = load ppc_fp128, ppc_fp128* @ld2, align 16
%sub = fsub ppc_fp128 %2, %3
- store ppc_fp128 %sub, ppc_fp128* %c, align 16
+ store volatile ppc_fp128 %sub, ppc_fp128* %c, align 16
%4 = load ppc_fp128, ppc_fp128* @ld, align 16
%5 = load ppc_fp128, ppc_fp128* @ld2, align 16
%mul = fmul ppc_fp128 %4, %5
- store ppc_fp128 %mul, ppc_fp128* %c, align 16
+ store volatile ppc_fp128 %mul, ppc_fp128* %c, align 16
%6 = load ppc_fp128, ppc_fp128* @ld, align 16
%7 = load ppc_fp128, ppc_fp128* @ld2, align 16
%div = fdiv ppc_fp128 %6, %7
- store ppc_fp128 %div, ppc_fp128* %c, align 16
+ store volatile ppc_fp128 %div, ppc_fp128* %c, align 16
ret void
; CHECK-LABEL: __gcc_qadd
i32 %a5, ; %i5
i32 signext %a6, ; [%fp+92]
i8* %a7) { ; [%fp+96]
- store i8 %a0, i8* %a4
- store i8 %a1, i8* %a4
+ store volatile i8 %a0, i8* %a4
+ store volatile i8 %a1, i8* %a4
%p16 = bitcast i8* %a4 to i16*
- store i16 %a2, i16* %p16
+ store volatile i16 %a2, i16* %p16
%p32 = bitcast i8* %a4 to i32*
- store i32 %a3, i32* %p32
+ store volatile i32 %a3, i32* %p32
%pp = bitcast i8* %a4 to i8**
- store i8* %a4, i8** %pp
- store i32 %a5, i32* %p32
- store i32 %a6, i32* %p32
- store i8* %a7, i8** %pp
+ store volatile i8* %a4, i8** %pp
+ store volatile i32 %a5, i32* %p32
+ store volatile i32 %a6, i32* %p32
+ store volatile i8* %a7, i8** %pp
ret void
}
i32 %a5, ; %i5
i32 signext %a6, ; [%fp+BIAS+176]
i8* %a7) { ; [%fp+BIAS+184]
- store i8 %a0, i8* %a4
- store i8 %a1, i8* %a4
+ store volatile i8 %a0, i8* %a4
+ store volatile i8 %a1, i8* %a4
%p16 = bitcast i8* %a4 to i16*
- store i16 %a2, i16* %p16
+ store volatile i16 %a2, i16* %p16
%p32 = bitcast i8* %a4 to i32*
- store i32 %a3, i32* %p32
+ store volatile i32 %a3, i32* %p32
%pp = bitcast i8* %a4 to i8**
- store i8* %a4, i8** %pp
- store i32 %a5, i32* %p32
- store i32 %a6, i32* %p32
- store i8* %a7, i8** %pp
+ store volatile i8* %a4, i8** %pp
+ store volatile i32 %a5, i32* %p32
+ store volatile i32 %a6, i32* %p32
+ store volatile i8* %a7, i8** %pp
ret void
}
%rv = call { i64, i64 } @ret_i64_pair(i32 undef, i32 undef,
i64* undef, i64* undef)
%e0 = extractvalue { i64, i64 } %rv, 0
- store i64 %e0, i64* %i0
+ store volatile i64 %e0, i64* %i0
%e1 = extractvalue { i64, i64 } %rv, 1
store i64 %e1, i64* %i0
ret void
%v6 = extractvalue { i1, i1, i1, i1 } %call, 2
%v7 = extractvalue { i1, i1, i1, i1 } %call, 3
%val = zext i1 %v3 to i32
- store i32 %val, i32* @var
+ store volatile i32 %val, i32* @var
%val2 = zext i1 %v5 to i32
- store i32 %val2, i32* @var
+ store volatile i32 %val2, i32* @var
%val3 = zext i1 %v6 to i32
- store i32 %val3, i32* @var
+ store volatile i32 %val3, i32* @var
%val4 = zext i1 %v7 to i32
store i32 %val4, i32* @var
ret void
%z = alloca i8, align 1
; CHECK: add r1, sp, #8
; CHECK: str r1, [r0]
- store i8* %x, i8** %p, align 4
+ store volatile i8* %x, i8** %p, align 4
; CHECK: add r1, sp, #4
; CHECK: str r1, [r0]
- store i8* %y, i8** %p, align 4
+ store volatile i8* %y, i8** %p, align 4
; CHECK: mov r1, sp
; CHECK: str r1, [r0]
- store i8* %z, i8** %p, align 4
+ store volatile i8* %z, i8** %p, align 4
ret void
}
; CHECK: add r1, sp, #1020
; CHECK: adds r1, #4
; CHECK: str r1, [r0]
- store [1024 x i8]* %arr1, [1024 x i8]** %p, align 4
+ store volatile [1024 x i8]* %arr1, [1024 x i8]** %p, align 4
; CHECK: mov r1, sp
; CHECK: str r1, [r0]
- store [1024 x i8]* %arr2, [1024 x i8]** %p, align 4
+ store volatile [1024 x i8]* %arr2, [1024 x i8]** %p, align 4
ret void
}
; CHECK: str{{(.w)?}} r{{[0-9]+}}, [sp
; CHECK: str{{(.w)?}} r{{[0-9]+}}, [sp
; CHECK: str{{(.w)?}} r{{[0-9]+}}, [sp
- store %union.rec* null, %union.rec** @zz_hold, align 4
+ store volatile %union.rec* null, %union.rec** @zz_hold, align 4
store %union.rec* null, %union.rec** @zz_res, align 4
- store %union.rec* %x, %union.rec** @zz_hold, align 4
+ store volatile %union.rec* %x, %union.rec** @zz_hold, align 4
%0 = call %union.rec* @Manifest(%union.rec* undef, %union.rec* %env, %struct.STYLE* %style, %union.rec** %bthr, %union.rec** %fthr, %union.rec** %target, %union.rec** %crs, i32 %ok, i32 %need_expand, %union.rec** %enclose, i32 %fcr) nounwind ; <%union.rec*> [#uses=0]
unreachable
; CHECK: calll _addrof_i32
; CHECK: retl
-
; Don't elide the copy when the alloca is escaped with a store.
-
define void @escape_with_store(i32 %x) {
%x1 = alloca i32
%x2 = alloca i32*
}
; CHECK-LABEL: _escape_with_store:
-; CHECK-DAG: movl {{.*}}(%esp), %[[reg:[^ ]*]]
-; CHECK-DAG: movl $0, [[offs:[0-9]*]](%esp)
-; CHECK: movl %[[reg]], [[offs]](%esp)
+; CHECK: movl {{.*}}(%esp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], [[offs:[0-9]*]](%esp)
; CHECK: calll _addrof_i32
; X32-SSE: # BB#0:
; X32-SSE-NEXT: pushl %ebp
; X32-SSE-NEXT: movl %esp, %ebp
-; X32-SSE-NEXT: pushl %esi
; X32-SSE-NEXT: andl $-16, %esp
; X32-SSE-NEXT: subl $16, %esp
; X32-SSE-NEXT: movl 72(%ebp), %eax
; X32-SSE-NEXT: movl 76(%ebp), %ecx
-; X32-SSE-NEXT: movl 12(%ebp), %edx
; X32-SSE-NEXT: movdqa 56(%ebp), %xmm3
; X32-SSE-NEXT: movdqa 40(%ebp), %xmm4
; X32-SSE-NEXT: movdqa 24(%ebp), %xmm5
-; X32-SSE-NEXT: movl 8(%ebp), %esi
-; X32-SSE-NEXT: addps .LCPI0_0, %xmm0
-; X32-SSE-NEXT: movntps %xmm0, (%esi)
-; X32-SSE-NEXT: paddq .LCPI0_1, %xmm2
-; X32-SSE-NEXT: movntdq %xmm2, (%esi)
-; X32-SSE-NEXT: addpd .LCPI0_2, %xmm1
-; X32-SSE-NEXT: movntpd %xmm1, (%esi)
-; X32-SSE-NEXT: paddd .LCPI0_3, %xmm5
-; X32-SSE-NEXT: movntdq %xmm5, (%esi)
-; X32-SSE-NEXT: paddw .LCPI0_4, %xmm4
-; X32-SSE-NEXT: movntdq %xmm4, (%esi)
-; X32-SSE-NEXT: paddb .LCPI0_5, %xmm3
-; X32-SSE-NEXT: movntdq %xmm3, (%esi)
-; X32-SSE-NEXT: movntil %edx, (%esi)
-; X32-SSE-NEXT: movntil %ecx, 4(%esi)
-; X32-SSE-NEXT: movntil %eax, (%esi)
-; X32-SSE-NEXT: leal -4(%ebp), %esp
-; X32-SSE-NEXT: popl %esi
+; X32-SSE-NEXT: movl 8(%ebp), %edx
+; X32-SSE-NEXT: addps {{\.LCPI.*}}, %xmm0
+; X32-SSE-NEXT: movntps %xmm0, (%edx)
+; X32-SSE-NEXT: paddq {{\.LCPI.*}}, %xmm2
+; X32-SSE-NEXT: movntdq %xmm2, (%edx)
+; X32-SSE-NEXT: addpd {{\.LCPI.*}}, %xmm1
+; X32-SSE-NEXT: movntpd %xmm1, (%edx)
+; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm5
+; X32-SSE-NEXT: movntdq %xmm5, (%edx)
+; X32-SSE-NEXT: paddw {{\.LCPI.*}}, %xmm4
+; X32-SSE-NEXT: movntdq %xmm4, (%edx)
+; X32-SSE-NEXT: paddb {{\.LCPI.*}}, %xmm3
+; X32-SSE-NEXT: movntdq %xmm3, (%edx)
+; X32-SSE-NEXT: movntil %ecx, 4(%edx)
+; X32-SSE-NEXT: movntil %eax, (%edx)
+; X32-SSE-NEXT: movl %ebp, %esp
; X32-SSE-NEXT: popl %ebp
; X32-SSE-NEXT: retl
;
; X32-AVX: # BB#0:
; X32-AVX-NEXT: pushl %ebp
; X32-AVX-NEXT: movl %esp, %ebp
-; X32-AVX-NEXT: pushl %esi
; X32-AVX-NEXT: andl $-16, %esp
; X32-AVX-NEXT: subl $16, %esp
; X32-AVX-NEXT: movl 72(%ebp), %eax
; X32-AVX-NEXT: movl 76(%ebp), %ecx
-; X32-AVX-NEXT: movl 12(%ebp), %edx
; X32-AVX-NEXT: vmovdqa 56(%ebp), %xmm3
; X32-AVX-NEXT: vmovdqa 40(%ebp), %xmm4
; X32-AVX-NEXT: vmovdqa 24(%ebp), %xmm5
-; X32-AVX-NEXT: movl 8(%ebp), %esi
-; X32-AVX-NEXT: vaddps .LCPI0_0, %xmm0, %xmm0
-; X32-AVX-NEXT: vmovntps %xmm0, (%esi)
-; X32-AVX-NEXT: vpaddq .LCPI0_1, %xmm2, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT: vaddpd .LCPI0_2, %xmm1, %xmm0
-; X32-AVX-NEXT: vmovntpd %xmm0, (%esi)
-; X32-AVX-NEXT: vpaddd .LCPI0_3, %xmm5, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT: vpaddw .LCPI0_4, %xmm4, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT: vpaddb .LCPI0_5, %xmm3, %xmm0
-; X32-AVX-NEXT: vmovntdq %xmm0, (%esi)
-; X32-AVX-NEXT: movntil %edx, (%esi)
-; X32-AVX-NEXT: movntil %ecx, 4(%esi)
-; X32-AVX-NEXT: movntil %eax, (%esi)
-; X32-AVX-NEXT: leal -4(%ebp), %esp
-; X32-AVX-NEXT: popl %esi
+; X32-AVX-NEXT: movl 8(%ebp), %edx
+; X32-AVX-NEXT: vaddps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-AVX-NEXT: vmovntps %xmm0, (%edx)
+; X32-AVX-NEXT: vpaddq {{\.LCPI.*}}, %xmm2, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: vaddpd {{\.LCPI.*}}, %xmm1, %xmm0
+; X32-AVX-NEXT: vmovntpd %xmm0, (%edx)
+; X32-AVX-NEXT: vpaddd {{\.LCPI.*}}, %xmm5, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: vpaddw {{\.LCPI.*}}, %xmm4, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: vpaddb {{\.LCPI.*}}, %xmm3, %xmm0
+; X32-AVX-NEXT: vmovntdq %xmm0, (%edx)
+; X32-AVX-NEXT: movntil %ecx, 4(%edx)
+; X32-AVX-NEXT: movntil %eax, (%edx)
+; X32-AVX-NEXT: movl %ebp, %esp
; X32-AVX-NEXT: popl %ebp
; X32-AVX-NEXT: retl
;
@g_16 = internal global i32 -1
; X64-LABEL: test8:
-; X64-NEXT: movl _g_16(%rip), %eax
-; X64-NEXT: movl $0, _g_16(%rip)
-; X64-NEXT: orl $1, %eax
-; X64-NEXT: movl %eax, _g_16(%rip)
+; X64-NEXT: orb $1, _g_16(%rip)
; X64-NEXT: ret
define void @test8() nounwind {
%tmp = load i32, i32* @g_16
%v6 = extractvalue { i1, i1, i1, i1 } %call, 2
%v7 = extractvalue { i1, i1, i1, i1 } %call, 3
%val = zext i1 %v3 to i32
- store i32 %val, i32* @var
+ store volatile i32 %val, i32* @var
%val2 = zext i1 %v5 to i32
- store i32 %val2, i32* @var
+ store volatile i32 %val2, i32* @var
%val3 = zext i1 %v6 to i32
- store i32 %val3, i32* @var
+ store volatile i32 %val3, i32* @var
%val4 = zext i1 %v7 to i32
store i32 %val4, i32* @var
ret void
; Check that proper alignment of spilled vector does not affect vargs
; CHECK-LABEL: vargs_not_affected
-; CHECK: leal 28(%ebp), %eax
+; CHECK: movl 28(%ebp), %eax
define i32 @vargs_not_affected(<4 x float> %v, i8* %f, ...) {
entry:
%ap = alloca i8*, align 4
; LINUX: movq $0, -8(%rsp)
%this = alloca %Object addrspace(1)*
- store %Object addrspace(1)* null, %Object addrspace(1)** %this
- store %Object addrspace(1)* %param0, %Object addrspace(1)** %this
+ store volatile %Object addrspace(1)* null, %Object addrspace(1)** %this
+ store volatile %Object addrspace(1)* %param0, %Object addrspace(1)** %this
br label %0
; <label>:0 ; preds = %entry
; CHECK-LABEL: arg4:
; CHECK: pushq
-; va_start:
-; CHECK: leaq 48(%rsp), [[REG_arg4_1:%[a-z]+]]
-; CHECK: movq [[REG_arg4_1]], (%rsp)
+; va_start (its store is optimized away because it is overwritten by va_arg)
; va_arg:
; CHECK: leaq 52(%rsp), [[REG_arg4_2:%[a-z]+]]
; CHECK: movq [[REG_arg4_2]], (%rsp)
}
; CHECK-LABEL: arg4:
-; va_start:
-; CHECK: leaq 48(%rsp), [[REG_arg4_1:%[a-z]+]]
-; CHECK: movq [[REG_arg4_1]], (%rsp)
+; va_start (its store is optimized away because it is overwritten by va_arg)
; va_arg:
; CHECK: leaq 52(%rsp), [[REG_arg4_2:%[a-z]+]]
; CHECK: movq [[REG_arg4_2]], (%rsp)