From d6351340bb9bf7703fb79629efdba5886d434694 Mon Sep 17 00:00:00 2001
From: Nirav Dave
Date: Wed, 13 Mar 2019 17:07:09 +0000
Subject: [PATCH] [DAGCombiner] If a TokenFactor would be merged into its user, consider the user later.

Summary:
A number of optimizations are inhibited by single-use TokenFactors not
being merged into the TokenFactor that uses them. This change makes us
consider whether we can do the merge immediately.

Most test changes here are due to the change in visitation order causing
minor reorderings and associated reassociation of paired memory operations.

CodeGen tests with non-reordering changes:

  X86/aligned-variadic.ll -- the memory-based add is folded into the stored
  leaq value.
  X86/constant-combines.ll -- optimizes out the overlap between stores.
  X86/pr40631_deadstore_elision.ll -- folds the constant byte store into the
  preceding quad-word constant store.

Reviewers: RKSimon, craig.topper, spatel, efriedma, courbet

Reviewed By: courbet

Subscribers: dylanmckay, sdardis, nemanjai, jvesely, nhaehnle, javed.absar, eraman, hiraditya, kbarton, jrtc27, atanasyan, jsji, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D59260

llvm-svn: 356068
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +
 .../test/CodeGen/AArch64/aarch64_win64cc_vararg.ll | 8 +-
 llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll | 2 +-
 llvm/test/CodeGen/AArch64/alloca.ll | 14 +-
 llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll | 16 +-
 llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll | 27 +--
 llvm/test/CodeGen/AArch64/win64_vararg.ll | 32 +--
 llvm/test/CodeGen/AMDGPU/call-argument-types.ll | 2 +-
 .../CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll | 4 +-
 .../CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll | 4 +-
 .../ARM/2014-02-21-byval-reg-split-alignment.ll | 8 +-
 llvm/test/CodeGen/ARM/memset-inline.ll | 12 +-
 llvm/test/CodeGen/ARM/thumb1_return_sequence.ll | 4 +-
 llvm/test/CodeGen/ARM/unaligned_load_store.ll | 16 +-
 llvm/test/CodeGen/AVR/calling-conv/c/basic.ll | 32 +--
 llvm/test/CodeGen/AVR/directmem.ll | 24 +--
 llvm/test/CodeGen/BPF/undef.ll | 28 +--
 llvm/test/CodeGen/MSP430/cc_args.ll | 8 +-
 llvm/test/CodeGen/Mips/v2i16tof32.ll | 22 +-
 llvm/test/CodeGen/PowerPC/f128-aggregates.ll | 86 ++++----
 llvm/test/CodeGen/PowerPC/ppc64-byval-align.ll | 2 +-
 llvm/test/CodeGen/Thumb/frame-access.ll | 6 +-
 llvm/test/CodeGen/Thumb/mvn.ll | 12 +-
 llvm/test/CodeGen/X86/aligned-variadic.ll | 2 +-
 llvm/test/CodeGen/X86/atomic-idempotent.ll | 6 +-
 llvm/test/CodeGen/X86/avx-load-store.ll | 4 +-
 llvm/test/CodeGen/X86/btc_bts_btr.ll | 6 +-
 llvm/test/CodeGen/X86/combine-sbb.ll | 2 +-
 llvm/test/CodeGen/X86/constant-combines.ll | 2 +-
 llvm/test/CodeGen/X86/min-legal-vector-width.ll | 38 ++--
 llvm/test/CodeGen/X86/musttail-varargs.ll | 20 +-
 llvm/test/CodeGen/X86/musttail.ll | 8 +-
 llvm/test/CodeGen/X86/nosse-vector.ll | 8 +-
 llvm/test/CodeGen/X86/oddshuffles.ll | 238 ++++++++++-----------
 llvm/test/CodeGen/X86/pr40631_deadstore_elision.ll | 3 +-
 llvm/test/CodeGen/X86/rotate.ll | 2 +-
 llvm/test/CodeGen/X86/rotate4.ll | 24 +--
 llvm/test/CodeGen/X86/sadd_sat_vec.ll | 4 +-
 llvm/test/CodeGen/X86/shift-and.ll | 2 +-
 llvm/test/CodeGen/X86/shrink_vmul-widen.ll | 64 +++---
 llvm/test/CodeGen/X86/shrink_vmul.ll | 64 +++---
 llvm/test/CodeGen/X86/ssub_sat_vec.ll | 4 +-
 llvm/test/CodeGen/X86/uadd_sat_vec.ll | 4 +-
 llvm/test/CodeGen/X86/usub_sat_vec.ll | 4 +-
 llvm/test/CodeGen/X86/vastart-defs-eflags.ll | 8 +-
 llvm/test/CodeGen/X86/vec_fpext.ll | 32 +--
 llvm/test/CodeGen/X86/widen_cast-2.ll | 4 +-
 llvm/test/CodeGen/X86/widen_load-2.ll | 40 ++--
 llvm/test/CodeGen/X86/win64_frame.ll | 4 +-
 llvm/test/CodeGen/X86/win64_vararg.ll | 6 +-
 llvm/test/CodeGen/X86/x86-64-ms_abi-vararg.ll | 10 +-
 llvm/test/CodeGen/XCore/byVal.ll | 4 +-
 52 files changed, 498 insertions(+), 494 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9ff62ad..1095b41 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1709,6 +1709,12 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
   if (OptLevel == CodeGenOpt::None)
     return SDValue();
 
+  // If this is used by only a single token factor, we should make sure we
+  // have a chance to merge them together. This prevents TF chains from
+  // inhibiting optimizations.
+  if (N->hasOneUse() && N->use_begin()->getOpcode() == ISD::TokenFactor)
+    AddToWorklist(*(N->use_begin()));
+
   SmallVector<SDNode *, 8> TFs;   // List of token factors to visit.
   SmallVector<SDValue, 8> Ops;    // Ops for replacing token factor.
   SmallPtrSet<SDNode *, 16> SeenOps;
diff --git a/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll b/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
index 43b821f..a45ae74 100644
--- a/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64_win64cc_vararg.ll
@@ -5,10 +5,10 @@ entry:
 ; CHECK: str x30, [sp, #-80]!
 ; CHECK: add x8, sp, #24
 ; CHECK: add x0, sp, #24
-; CHECK: stp x6, x7, [sp, #64]
-; CHECK: stp x4, x5, [sp, #48]
-; CHECK: stp x2, x3, [sp, #32]
-; CHECK: str x1, [sp, #24]
+; CHECK: stp x1, x2, [sp, #24]
+; CHECK: stp x3, x4, [sp, #40]
+; CHECK: stp x5, x6, [sp, #56]
+; CHECK: str x7, [sp, #72]
 ; CHECK: str x8, [sp, #8]
 ; CHECK: bl other_func
 ; CHECK: ldr x30, [sp], #80
diff --git a/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll b/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll
index b099b18..a6bc364 100644
--- a/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll
+++ b/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll
@@ -44,7 +44,7 @@ entry:
 ; CHECK: sub sp, sp, #96
 ; CHECK: stp x29, x30, [sp, #16]
 ; CHECK: add x29, sp, #16
-; CHECK: str x1, [x29, #24]
+; CHECK: stp x1, x2, [x29, #24]
 ; CHECK: add x1, x29, #8
 ; CHECK: ldp x29, x30, [sp, #16]
 ; CHECK: add sp, sp, #96
diff --git a/llvm/test/CodeGen/AArch64/alloca.ll b/llvm/test/CodeGen/AArch64/alloca.ll
index ab7a631..25bb3c8 100644
--- a/llvm/test/CodeGen/AArch64/alloca.ll
+++ b/llvm/test/CodeGen/AArch64/alloca.ll
@@ -78,22 +78,22 @@ define void @test_variadic_alloca(i64 %n, ...) {
 ; CHECK: stp x29, x30, [sp, #-16]!
 ; CHECK: mov x29, sp
 ; CHECK: sub sp, sp, #192
-; CHECK: stp q6, q7, [x29, #-96]
+; CHECK-DAG: stp q6, q7, [x29, #-96]
 ; [...]
-; CHECK: stp q0, q1, [x29, #-192]
+; CHECK-DAG: stp q0, q1, [x29, #-192]
 
-; CHECK: stp x6, x7, [x29, #-16]
+; CHECK-DAG: stp x5, x6, [x29, #-24]
 ; [...]
-; CHECK: stp x2, x3, [x29, #-48]
+; CHECK-DAG: stp x1, x2, [x29, #-56]
 
 ; CHECK-NOFP-ARM64: stp x29, x30, [sp, #-16]!
 ; CHECK-NOFP-ARM64: mov x29, sp
 ; CHECK-NOFP-ARM64: sub sp, sp, #64
-; CHECK-NOFP-ARM64: stp x6, x7, [x29, #-16]
+; CHECK-NOFP-ARM64-DAG: stp x5, x6, [x29, #-24]
 ; [...]
-; CHECK-NOFP-ARM64: stp x4, x5, [x29, #-32]
+; CHECK-NOFP-ARM64-DAG: stp x3, x4, [x29, #-40]
 ; [...]
-; CHECK-NOFP-ARM64: stp x2, x3, [x29, #-48]
+; CHECK-NOFP-ARM64-DAG: stp x1, x2, [x29, #-56]
 ; [...]
; CHECK-NOFP-ARM64: mov x8, sp diff --git a/llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll b/llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll index 629cf37..f6d66b6 100644 --- a/llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll +++ b/llvm/test/CodeGen/AArch64/arm64-memcpy-inline.ll @@ -16,10 +16,10 @@ define i32 @t0() { entry: ; CHECK-LABEL: t0: -; CHECK: ldur [[REG0:w[0-9]+]], [x[[BASEREG:[0-9]+]], #7] -; CHECK: stur [[REG0]], [x[[BASEREG2:[0-9]+]], #7] -; CHECK: ldr [[REG2:x[0-9]+]], -; CHECK: str [[REG2]], +; CHECK-DAG: ldur [[REG0:w[0-9]+]], [x[[BASEREG:[0-9]+]], #7] +; CHECK-DAG: stur [[REG0]], [x[[BASEREG2:[0-9]+]], #7] +; CHECK-DAG: ldr [[REG2:x[0-9]+]], +; CHECK-DAG: str [[REG2]], call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 8 getelementptr inbounds (%struct.x, %struct.x* @dst, i32 0, i32 0), i8* align 8 getelementptr inbounds (%struct.x, %struct.x* @src, i32 0, i32 0), i32 11, i1 false) ret i32 0 } @@ -85,10 +85,10 @@ entry: define void @t6() nounwind { entry: ; CHECK-LABEL: t6: -; CHECK: ldur [[REG9:x[0-9]+]], [x{{[0-9]+}}, #6] -; CHECK: stur [[REG9]], [x{{[0-9]+}}, #6] -; CHECK: ldr -; CHECK: str +; CHECK-DAG: ldur [[REG9:x[0-9]+]], [x{{[0-9]+}}, #6] +; CHECK-DAG: stur [[REG9]], [x{{[0-9]+}}, #6] +; CHECK-DAG: ldr +; CHECK-DAG: str call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([512 x i8], [512 x i8]* @spool.splbuf, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str6, i64 0, i64 0), i64 14, i1 false) ret void } diff --git a/llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll b/llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll index 0912529..db87d7f 100644 --- a/llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-variadic-aapcs.ll @@ -14,13 +14,13 @@ define void @test_simple(i32 %n, ...) { ; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var ; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var -; CHECK: stp x1, x2, [sp, #[[GR_BASE:[0-9]+]]] +; CHECK-DAG: stp x6, x7, [sp, # ; ... omit middle ones ... -; CHECK: str x7, [sp, # +; CHECK-DAG: str x1, [sp, #[[GR_BASE:[0-9]+]]] -; CHECK: stp q0, q1, [sp] +; CHECK-DAG: stp q0, q1, [sp] ; ... omit middle ones ... -; CHECK: stp q6, q7, [sp, # +; CHECK-DAG: stp q6, q7, [sp, # ; CHECK: str [[STACK_TOP]], [x[[VA_LIST]]] @@ -50,13 +50,13 @@ define void @test_fewargs(i32 %n, i32 %n1, i32 %n2, float %m, ...) { ; CHECK: adrp x[[VA_LIST_HI:[0-9]+]], var ; CHECK: add x[[VA_LIST:[0-9]+]], {{x[0-9]+}}, :lo12:var -; CHECK: stp x3, x4, [sp, #[[GR_BASE:[0-9]+]]] +; CHECK-DAG: stp x6, x7, [sp, # ; ... omit middle ones ... -; CHECK: str x7, [sp, # +; CHECK-DAG: str x3, [sp, #[[GR_BASE:[0-9]+]]] -; CHECK: stp q1, q2, [sp] +; CHECK-DAG: stp q6, q7, [sp, #80] ; ... omit middle ones ... -; CHECK: str q7, [sp, # +; CHECK-DAG: str q1, [sp] ; CHECK: str [[STACK_TOP]], [x[[VA_LIST]]] @@ -95,10 +95,13 @@ define void @test_nospare([8 x i64], [8 x float], ...) { ; __stack field should point just past them. define void @test_offsetstack([8 x i64], [2 x i64], [3 x float], ...) { ; CHECK-LABEL: test_offsetstack: -; CHECK: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #-80]! 
-; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #96 -; CHECK: add x[[VAR:[0-9]+]], {{x[0-9]+}}, :lo12:var -; CHECK: str [[STACK_TOP]], [x[[VAR]]] + +; CHECK-DAG: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #48] +; CHECK-DAG: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #16] +; CHECK-DAG: str {{q[0-9]+}}, [sp] +; CHECK-DAG: add [[STACK_TOP:x[0-9]+]], sp, #96 +; CHECK-DAG: add x[[VAR:[0-9]+]], {{x[0-9]+}}, :lo12:var +; CHECK-DAG: str [[STACK_TOP]], [x[[VAR]]] %addr = bitcast %va_list* @var to i8* call void @llvm.va_start(i8* %addr) diff --git a/llvm/test/CodeGen/AArch64/win64_vararg.ll b/llvm/test/CodeGen/AArch64/win64_vararg.ll index 38da60b..d9bb2ff 100644 --- a/llvm/test/CodeGen/AArch64/win64_vararg.ll +++ b/llvm/test/CodeGen/AArch64/win64_vararg.ll @@ -5,10 +5,10 @@ entry: ; CHECK: str x30, [sp, #-80]! ; CHECK: add x8, sp, #24 ; CHECK: add x0, sp, #24 -; CHECK: stp x6, x7, [sp, #64] -; CHECK: stp x4, x5, [sp, #48] -; CHECK: stp x2, x3, [sp, #32] -; CHECK: str x1, [sp, #24] +; CHECK: stp x1, x2, [sp, #24] +; CHECK: stp x3, x4, [sp, #40] +; CHECK: stp x5, x6, [sp, #56] +; CHECK: str x7, [sp, #72] ; CHECK: str x8, [sp, #8] ; CHECK: bl other_func ; CHECK: ldr x30, [sp], #80 @@ -78,10 +78,10 @@ entry: ; CHECK-LABEL: copy1: ; CHECK: sub sp, sp, #80 ; CHECK: add x8, sp, #24 -; CHECK: stp x6, x7, [sp, #64] -; CHECK: stp x4, x5, [sp, #48] -; CHECK: stp x2, x3, [sp, #32] -; CHECK: str x1, [sp, #24] +; CHECK: stp x1, x2, [sp, #24] +; CHECK: stp x3, x4, [sp, #40] +; CHECK: stp x5, x6, [sp, #56] +; CHECK: str x7, [sp, #72] ; CHECK: stp x8, x8, [sp], #80 ; CHECK: ret define void @copy1(i64 %a0, ...) nounwind { @@ -111,9 +111,9 @@ declare i64* @__local_stdio_printf_options() local_unnamed_addr #4 ; CHECK: mov x19, x2 ; CHECK: mov x20, x1 ; CHECK: mov x21, x0 -; CHECK: stp x6, x7, [x29, #48] -; CHECK: stp x4, x5, [x29, #32] -; CHECK: str x3, [x29, #24] +; CHECK: stp x3, x4, [x29, #24] +; CHECK: stp x5, x6, [x29, #40] +; CHECK: str x7, [x29, #56] ; CHECK: str x8, [sp, #8] ; CHECK: bl __local_stdio_printf_options ; CHECK: ldr x8, [x0] @@ -162,9 +162,9 @@ attributes #6 = { "no-frame-pointer-elim"="true" } ; CHECK: lsr x15, x8, #4 ; CHECK: mov x19, x1 ; CHECK: mov [[REG2:x[0-9]+]], sp -; CHECK: stp x6, x7, [x29, #48] -; CHECK: stp x4, x5, [x29, #32] ; CHECK: stp x2, x3, [x29, #16] +; CHECK: stp x4, x5, [x29, #32] +; CHECK: stp x6, x7, [x29, #48] ; CHECK: bl __chkstk ; CHECK: mov x8, sp ; CHECK: sub [[REG:x[0-9]+]], x8, x15, lsl #4 @@ -219,9 +219,9 @@ declare void @llvm.stackrestore(i8*) ; CHECK-DAG: mov x19, x2 ; CHECK-DAG: mov x20, x1 ; CHECK-DAG: mov x21, x0 -; CHECK-DAG: stp x6, x7, [sp, #80] -; CHECK-DAG: stp x4, x5, [sp, #64] -; CHECK-DAG: str x3, [sp, #56] +; CHECK-DAG: stp x3, x4, [sp, #56] +; CHECK-DAG: stp x5, x6, [sp, #72] +; CHECK-DAG: str x7, [sp, #88] ; CHECK-DAG: str x8, [sp, #8] ; CHECK-DAG: bl __local_stdio_printf_options ; CHECK-DAG: ldr x8, [x0] diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index 78ccc8c..4a4bb0f 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -752,8 +752,8 @@ entry: ; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:4 ; GCN: buffer_load_dword v33, off, s[0:3], s5 offset:8 -; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:8 ; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:4 +; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:8 ; GCN: s_getpc_b64 ; GCN: buffer_load_dword 
v33, off, s[0:3], s5 offset:12 ; 4-byte Folded Reload ; GCN: buffer_load_dword v32, off, s[0:3], s5 offset:16 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll b/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll index 9548602..1b0dbe9 100644 --- a/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll +++ b/llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll @@ -42,8 +42,8 @@ declare void @f(double); ; CHECK-LABEL: test_byval_8_bytes_alignment_fixed_arg: ; CHECK-NOT: str r1 -; CHECK: str r3, [sp, #12] -; CHECK: str r2, [sp, #8] +; CHECK-DAG: str r3, [sp, #12] +; CHECK-DAG: str r2, [sp, #8] ; CHECK-NOT: str r1 define void @test_byval_8_bytes_alignment_fixed_arg(i32 %n1, %struct_t* byval %val) nounwind { entry: diff --git a/llvm/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll b/llvm/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll index 34af902..1530d64 100644 --- a/llvm/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll +++ b/llvm/test/CodeGen/ARM/2012-10-04-FixedFrame-vs-byval.ll @@ -7,8 +7,8 @@ declare i32 @printf(i8*, ...) ; CHECK-LABEL: test_byval_usage_scheduling: -; CHECK: str r3, [sp, #12] -; CHECK: str r2, [sp, #8] +; CHECK-DAG: str r3, [sp, #12] +; CHECK-DAG: str r2, [sp, #8] ; CHECK: vldr d16, [sp, #8] define void @test_byval_usage_scheduling(i32 %n1, i32 %n2, %struct_t* byval %val) nounwind { entry: diff --git a/llvm/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll b/llvm/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll index 5b2fc57..f8c4d5d 100644 --- a/llvm/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll +++ b/llvm/test/CodeGen/ARM/2014-02-21-byval-reg-split-alignment.ll @@ -35,8 +35,8 @@ define void @foo2(i32 %a, %struct8bytes8align* byval %b) { ; CHECK: sub sp, sp, #8 ; CHECK: push {r11, lr} ; CHECK: add r0, sp, #8 -; CHECK: str r3, [sp, #12] -; CHECK: str r2, [sp, #8] +; CHECK-DAG: str r3, [sp, #12] +; CHECK-DAG: str r2, [sp, #8] ; CHECK: bl usePtr ; CHECK: pop {r11, lr} ; CHECK: add sp, sp, #8 @@ -70,8 +70,8 @@ define void @foo4(%struct4bytes* byval %a, %struct8bytes8align* byval %b) { ; CHECK: push {r11, lr} ; CHECK: str r0, [sp, #8] ; CHECK: add r0, sp, #16 -; CHECK: str r3, [sp, #20] -; CHECK: str r2, [sp, #16] +; CHECK-DAG: str r3, [sp, #20] +; CHECK-DAG: str r2, [sp, #16] ; CHECK: bl usePtr ; CHECK: pop {r11, lr} ; CHECK: add sp, sp, #16 diff --git a/llvm/test/CodeGen/ARM/memset-inline.ll b/llvm/test/CodeGen/ARM/memset-inline.ll index 01b21e9..1b88539 100644 --- a/llvm/test/CodeGen/ARM/memset-inline.ll +++ b/llvm/test/CodeGen/ARM/memset-inline.ll @@ -25,12 +25,12 @@ entry: ; CHECK-7A: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2] ; CHECK-6M-LABEL: t2: ; CHECK-6M: movs [[REG:r[0-9]+]], #0 -; CHECK-6M: str [[REG]], [sp, #20] -; CHECK-6M: str [[REG]], [sp, #16] -; CHECK-6M: str [[REG]], [sp, #12] -; CHECK-6M: str [[REG]], [sp, #8] -; CHECK-6M: str [[REG]], [sp, #4] -; CHECK-6M: str [[REG]], [sp] +; CHECK-6M-DAG: str [[REG]], [sp, #20] +; CHECK-6M-DAG: str [[REG]], [sp, #16] +; CHECK-6M-DAG: str [[REG]], [sp, #12] +; CHECK-6M-DAG: str [[REG]], [sp, #8] +; CHECK-6M-DAG: str [[REG]], [sp, #4] +; CHECK-6M-DAG: str [[REG]], [sp] %buf = alloca [26 x i8], align 1 %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0 call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i1 false) diff --git a/llvm/test/CodeGen/ARM/thumb1_return_sequence.ll b/llvm/test/CodeGen/ARM/thumb1_return_sequence.ll index 11e18f1..a7f78c7 100644 --- a/llvm/test/CodeGen/ARM/thumb1_return_sequence.ll +++ 
b/llvm/test/CodeGen/ARM/thumb1_return_sequence.ll @@ -57,14 +57,14 @@ entry: ; Epilogue ; -------- -; CHECK-V4T: ldr [[POP:r[4567]]], [sp, #12] +; CHECK-V4T: ldr [[POP:r[4567]]], [sp, #16] ; CHECK-V4T-NEXT: mov lr, [[POP]] ; CHECK-V4T-NEXT: pop {[[SAVED]]} ; CHECK-V4T-NEXT: add sp, #16 ; CHECK-V4T-NEXT: bx lr ; CHECK-V5T: lsls r4 ; CHECK-V5T-NEXT: mov sp, r4 -; CHECK-V5T: ldr [[POP:r[4567]]], [sp, #12] +; CHECK-V5T: ldr [[POP:r[4567]]], [sp, #16] ; CHECK-V5T-NEXT: mov lr, [[POP]] ; CHECK-V5T-NEXT: pop {[[SAVED]]} ; CHECK-V5T-NEXT: add sp, #16 diff --git a/llvm/test/CodeGen/ARM/unaligned_load_store.ll b/llvm/test/CodeGen/ARM/unaligned_load_store.ll index 4e16bda..75098e1 100644 --- a/llvm/test/CodeGen/ARM/unaligned_load_store.ll +++ b/llvm/test/CodeGen/ARM/unaligned_load_store.ll @@ -13,14 +13,14 @@ define void @t(i8* nocapture %a, i8* nocapture %b) nounwind { entry: ; EXPANDED-LABEL: t: -; EXPANDED: ldrb [[R2:r[0-9]+]] -; EXPANDED: ldrb [[R3:r[0-9]+]] -; EXPANDED: ldrb [[R12:r[0-9]+]] -; EXPANDED: ldrb [[R1:r[0-9]+]] -; EXPANDED: strb [[R1]] -; EXPANDED: strb [[R12]] -; EXPANDED: strb [[R3]] -; EXPANDED: strb [[R2]] +; EXPANDED-DAG: ldrb [[R2:r[0-9]+]] +; EXPANDED-DAG: ldrb [[R3:r[0-9]+]] +; EXPANDED-DAG: ldrb [[R12:r[0-9]+]] +; EXPANDED-DAG: ldrb [[R1:r[0-9]+]] +; EXPANDED-DAG: strb [[R1]] +; EXPANDED-DAG: strb [[R12]] +; EXPANDED-DAG: strb [[R3]] +; EXPANDED-DAG: strb [[R2]] ; UNALIGNED-LABEL: t: ; UNALIGNED: ldr r1 diff --git a/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll b/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll index a5d4676..80a61a4 100644 --- a/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll +++ b/llvm/test/CodeGen/AVR/calling-conv/c/basic.ll @@ -66,24 +66,24 @@ define void @ret_void_args_i64(i64 %a) { ; CHECK-LABEL: ret_void_args_i64_i64 define void @ret_void_args_i64_i64(i64 %a, i64 %b) { - ; CHECK: sts 11, r25 - ; CHECK-NEXT: sts 10, r24 - ; CHECK-NEXT: sts 9, r23 - ; CHECK-NEXT: sts 8, r22 - ; CHECK-NEXT: sts 7, r21 - ; CHECK-NEXT: sts 6, r20 - ; CHECK-NEXT: sts 5, r19 - ; CHECK-NEXT: sts 4, r18 + ; CHECK-DAG: sts 11, r25 + ; CHECK-DAG: sts 10, r24 + ; CHECK-DAG: sts 9, r23 + ; CHECK-DAG: sts 8, r22 + ; CHECK-DAG: sts 7, r21 + ; CHECK-DAG: sts 6, r20 + ; CHECK-DAG: sts 5, r19 + ; CHECK-DAG: sts 4, r18 store volatile i64 %a, i64* inttoptr (i64 4 to i64*) - ; CHECK-NEXT: sts 11, r17 - ; CHECK-NEXT: sts 10, r16 - ; CHECK-NEXT: sts 9, r15 - ; CHECK-NEXT: sts 8, r14 - ; CHECK-NEXT: sts 7, r13 - ; CHECK-NEXT: sts 6, r12 - ; CHECK-NEXT: sts 5, r11 - ; CHECK-NEXT: sts 4, r10 + ; CHECK-DAG: sts 11, r17 + ; CHECK-DAG: sts 10, r16 + ; CHECK-DAG: sts 9, r15 + ; CHECK-DAG: sts 8, r14 + ; CHECK-DAG: sts 7, r13 + ; CHECK-DAG: sts 6, r12 + ; CHECK-DAG: sts 5, r11 + ; CHECK-DAG: sts 4, r10 store volatile i64 %b, i64* inttoptr (i64 4 to i64*) ret void } diff --git a/llvm/test/CodeGen/AVR/directmem.ll b/llvm/test/CodeGen/AVR/directmem.ll index 6d2ddc5..6e1f72e 100644 --- a/llvm/test/CodeGen/AVR/directmem.ll +++ b/llvm/test/CodeGen/AVR/directmem.ll @@ -207,10 +207,10 @@ define i32 @static32_inc() { ; CHECK: sbci r23, 255 ; CHECK: sbci r24, 255 ; CHECK: sbci r25, 255 -; CHECK: sts long.static+3, r25 -; CHECK: sts long.static+2, r24 -; CHECK: sts long.static+1, r23 -; CHECK: sts long.static, r22 +; CHECK-DAG: sts long.static+3, r25 +; CHECK-DAG: sts long.static+2, r24 +; CHECK-DAG: sts long.static+1, r23 +; CHECK-DAG: sts long.static, r22 %1 = load i32, i32* @long.static %inc = add nsw i32 %1, 1 store i32 %inc, i32* @long.static @@ -309,14 +309,14 @@ define i64 @static64_inc() { ; CHECK: sbci 
r23, 255 ; CHECK: sbci r24, 255 ; CHECK: sbci r25, 255 -; CHECK: sts longlong.static+7, r25 -; CHECK: sts longlong.static+6, r24 -; CHECK: sts longlong.static+5, r23 -; CHECK: sts longlong.static+4, r22 -; CHECK: sts longlong.static+3, r21 -; CHECK: sts longlong.static+2, r20 -; CHECK: sts longlong.static+1, r19 -; CHECK: sts longlong.static, r18 +; CHECK-DAG: sts longlong.static+7, r25 +; CHECK-DAG: sts longlong.static+6, r24 +; CHECK-DAG: sts longlong.static+5, r23 +; CHECK-DAG: sts longlong.static+4, r22 +; CHECK-DAG: sts longlong.static+3, r21 +; CHECK-DAG: sts longlong.static+2, r20 +; CHECK-DAG: sts longlong.static+1, r19 +; CHECK-DAG: sts longlong.static, r18 %1 = load i64, i64* @longlong.static %inc = add nsw i64 %1, 1 store i64 %inc, i64* @longlong.static diff --git a/llvm/test/CodeGen/BPF/undef.ll b/llvm/test/CodeGen/BPF/undef.ll index 3736cb7..099c2f8 100644 --- a/llvm/test/CodeGen/BPF/undef.ll +++ b/llvm/test/CodeGen/BPF/undef.ll @@ -20,20 +20,20 @@ define i32 @ebpf_filter(%struct.__sk_buff* nocapture readnone %ebpf_packet) #0 s ; CHECK: *(u64 *)(r10 - 8) = r1 ; CHECK: r1 = 0 -; CHECK: *(u16 *)(r10 + 24) = r1 -; CHECK: *(u16 *)(r10 + 22) = r1 -; CHECK: *(u16 *)(r10 + 20) = r1 -; CHECK: *(u16 *)(r10 + 18) = r1 -; CHECK: *(u16 *)(r10 + 16) = r1 -; CHECK: *(u16 *)(r10 + 14) = r1 -; CHECK: *(u16 *)(r10 + 12) = r1 -; CHECK: *(u16 *)(r10 + 10) = r1 -; CHECK: *(u16 *)(r10 + 8) = r1 -; CHECK: *(u16 *)(r10 + 6) = r1 -; CHECK: *(u16 *)(r10 + 4) = r1 -; CHECK: *(u16 *)(r10 + 2) = r1 -; CHECK: *(u16 *)(r10 + 0) = r1 -; CHECK: *(u16 *)(r10 + 26) = r1 +; CHECK-DAG: *(u16 *)(r10 + 24) = r1 +; CHECK-DAG: *(u16 *)(r10 + 22) = r1 +; CHECK-DAG: *(u16 *)(r10 + 20) = r1 +; CHECK-DAG: *(u16 *)(r10 + 18) = r1 +; CHECK-DAG: *(u16 *)(r10 + 16) = r1 +; CHECK-DAG: *(u16 *)(r10 + 14) = r1 +; CHECK-DAG: *(u16 *)(r10 + 12) = r1 +; CHECK-DAG: *(u16 *)(r10 + 10) = r1 +; CHECK-DAG: *(u16 *)(r10 + 8) = r1 +; CHECK-DAG: *(u16 *)(r10 + 6) = r1 +; CHECK-DAG: *(u16 *)(r10 + 4) = r1 +; CHECK-DAG: *(u16 *)(r10 + 2) = r1 +; CHECK-DAG: *(u16 *)(r10 + 0) = r1 +; CHECK-DAG: *(u16 *)(r10 + 26) = r1 ; CHECK: r2 = r10 ; CHECK: r2 += -8 diff --git a/llvm/test/CodeGen/MSP430/cc_args.ll b/llvm/test/CodeGen/MSP430/cc_args.ll index c8164f1..6695a98 100644 --- a/llvm/test/CodeGen/MSP430/cc_args.ll +++ b/llvm/test/CodeGen/MSP430/cc_args.ll @@ -166,10 +166,10 @@ define void @f_i64_i64(i64 %a, i64 %b) #0 { ; CHECK: mov r13, &g_i64+2 ; CHECK: mov r12, &g_i64 store volatile i64 %a, i64* @g_i64, align 2 -; CHECK: mov 10(r4), &g_i64+6 -; CHECK: mov 8(r4), &g_i64+4 -; CHECK: mov 6(r4), &g_i64+2 -; CHECK: mov 4(r4), &g_i64 +; CHECK-DAG: mov 10(r4), &g_i64+6 +; CHECK-DAG: mov 8(r4), &g_i64+4 +; CHECK-DAG: mov 6(r4), &g_i64+2 +; CHECK-DAG: mov 4(r4), &g_i64 store volatile i64 %b, i64* @g_i64, align 2 ret void } diff --git a/llvm/test/CodeGen/Mips/v2i16tof32.ll b/llvm/test/CodeGen/Mips/v2i16tof32.ll index 7e5591e..334413b 100644 --- a/llvm/test/CodeGen/Mips/v2i16tof32.ll +++ b/llvm/test/CodeGen/Mips/v2i16tof32.ll @@ -15,23 +15,19 @@ define float @f(<8 x i16>* %a) { ; CHECK-NEXT: .cfi_def_cfa_register 30 ; CHECK-NEXT: addiu $1, $zero, -16 ; CHECK-NEXT: and $sp, $sp, $1 -; CHECK-NEXT: lw $1, 8($4) -; CHECK-NEXT: lw $2, 4($4) -; CHECK-NEXT: lw $3, 12($4) -; CHECK-NEXT: sw $3, 12($sp) -; CHECK-NEXT: sw $1, 8($sp) -; CHECK-NEXT: sw $2, 4($sp) -; CHECK-NEXT: lw $1, 0($4) -; CHECK-NEXT: sw $1, 0($sp) -; CHECK-NEXT: mtc1 $1, $f0 +; CHECK-NEXT: lw $1, 12($4) +; CHECK-NEXT: lw $2, 0($4) +; CHECK-NEXT: lw $3, 8($4) +; CHECK-NEXT: sw $3, 8($sp) +; 
CHECK-NEXT: sw $1, 12($sp) +; CHECK-NEXT: sw $2, 0($sp) +; CHECK-NEXT: lw $1, 4($4) +; CHECK-NEXT: sw $1, 4($sp) +; CHECK-NEXT: mtc1 $2, $f0 ; CHECK-NEXT: move $sp, $fp ; CHECK-NEXT: lw $fp, 28($sp) # 4-byte Folded Reload ; CHECK-NEXT: jr $ra ; CHECK-NEXT: addiu $sp, $sp, 32 -; CHECK-NEXT: .set at -; CHECK-NEXT: .set macro -; CHECK-NEXT: .set reorder -; CHECK-NEXT: .end f entry: %m = alloca <8 x i16> %0 = load <8 x i16>, <8 x i16>* %a diff --git a/llvm/test/CodeGen/PowerPC/f128-aggregates.ll b/llvm/test/CodeGen/PowerPC/f128-aggregates.ll index 9d16103..8a8c7f1 100644 --- a/llvm/test/CodeGen/PowerPC/f128-aggregates.ll +++ b/llvm/test/CodeGen/PowerPC/f128-aggregates.ll @@ -82,27 +82,27 @@ define fp128 @testStruct_03(%struct.With9fp128params* byval nocapture readonly align 16 %a) { ; CHECK-LABEL: testStruct_03: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: std r10, 88(r1) -; CHECK-NEXT: std r9, 80(r1) -; CHECK-NEXT: std r8, 72(r1) -; CHECK-NEXT: std r7, 64(r1) -; CHECK-NEXT: std r6, 56(r1) -; CHECK-NEXT: std r5, 48(r1) -; CHECK-NEXT: std r4, 40(r1) -; CHECK-NEXT: std r3, 32(r1) +; CHECK-DAG: std r10, 88(r1) +; CHECK-DAG: std r9, 80(r1) +; CHECK-DAG: std r8, 72(r1) +; CHECK-DAG: std r7, 64(r1) +; CHECK-DAG: std r6, 56(r1) +; CHECK-DAG: std r5, 48(r1) +; CHECK-DAG: std r4, 40(r1) +; CHECK-DAG: std r3, 32(r1) ; CHECK-NEXT: lxv v2, 128(r1) ; CHECK-NEXT: blr ; CHECK-BE-LABEL: testStruct_03: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: std r10, 104(r1) -; CHECK-BE-NEXT: std r9, 96(r1) -; CHECK-BE-NEXT: std r8, 88(r1) -; CHECK-BE-NEXT: std r7, 80(r1) -; CHECK-BE-NEXT: std r6, 72(r1) -; CHECK-BE-NEXT: std r5, 64(r1) -; CHECK-BE-NEXT: std r4, 56(r1) -; CHECK-BE-NEXT: std r3, 48(r1) +; CHECK-BE-DAG: std r10, 104(r1) +; CHECK-BE-DAG: std r9, 96(r1) +; CHECK-BE-DAG: std r8, 88(r1) +; CHECK-BE-DAG: std r7, 80(r1) +; CHECK-BE-DAG: std r6, 72(r1) +; CHECK-BE-DAG: std r5, 64(r1) +; CHECK-BE-DAG: std r4, 56(r1) +; CHECK-BE-DAG: std r3, 48(r1) ; CHECK-BE-NEXT: lxv v2, 144(r1) ; CHECK-BE-NEXT: blr entry: @@ -256,27 +256,27 @@ entry: define fp128 @testNestedAggregate(%struct.MixedC* byval nocapture readonly align 16 %a) { ; CHECK-LABEL: testNestedAggregate: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: std r8, 72(r1) -; CHECK-NEXT: std r7, 64(r1) -; CHECK-NEXT: std r10, 88(r1) -; CHECK-NEXT: std r9, 80(r1) -; CHECK-NEXT: std r6, 56(r1) -; CHECK-NEXT: std r5, 48(r1) -; CHECK-NEXT: std r4, 40(r1) -; CHECK-NEXT: std r3, 32(r1) +; CHECK-DAG: std r10, 88(r1) +; CHECK-DAG: std r9, 80(r1) +; CHECK-DAG: std r8, 72(r1) +; CHECK-DAG: std r7, 64(r1) +; CHECK-DAG: std r6, 56(r1) +; CHECK-DAG: std r5, 48(r1) +; CHECK-DAG: std r4, 40(r1) +; CHECK-DAG: std r3, 32(r1) ; CHECK-NEXT: lxv v2, 64(r1) ; CHECK-NEXT: blr ; CHECK-BE-LABEL: testNestedAggregate: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: std r8, 88(r1) -; CHECK-BE-NEXT: std r7, 80(r1) -; CHECK-BE-NEXT: std r10, 104(r1) -; CHECK-BE-NEXT: std r9, 96(r1) -; CHECK-BE-NEXT: std r6, 72(r1) -; CHECK-BE-NEXT: std r5, 64(r1) -; CHECK-BE-NEXT: std r4, 56(r1) -; CHECK-BE-NEXT: std r3, 48(r1) +; CHECK-BE-DAG: std r8, 88(r1) +; CHECK-BE-DAG: std r7, 80(r1) +; CHECK-BE-DAG: std r10, 104(r1) +; CHECK-BE-DAG: std r9, 96(r1) +; CHECK-BE-DAG: std r6, 72(r1) +; CHECK-BE-DAG: std r5, 64(r1) +; CHECK-BE-DAG: std r4, 56(r1) +; CHECK-BE-DAG: std r3, 48(r1) ; CHECK-BE-NEXT: lxv v2, 80(r1) ; CHECK-BE-NEXT: blr entry: @@ -337,17 +337,17 @@ entry: define fp128 @sum_float128(i32 signext %count, ...) 
{ ; CHECK-LABEL: sum_float128: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: std r10, 88(r1) -; CHECK-NEXT: std r9, 80(r1) -; CHECK-NEXT: std r8, 72(r1) -; CHECK-NEXT: std r7, 64(r1) -; CHECK-NEXT: std r6, 56(r1) -; CHECK-NEXT: cmpwi cr0, r3, 1 -; CHECK-NEXT: std r4, 40(r1) -; CHECK-NEXT: addis [[REG:r[0-9]+]], r2, .LCPI17_0@toc@ha -; CHECK-NEXT: addi [[REG1:r[0-9]+]], [[REG]], .LCPI17_0@toc@l -; CHECK-NEXT: lxvx v2, 0, [[REG1]] -; CHECK-NEXT: std r5, 48(r1) +; CHECK-DAG: std r10, 88(r1) +; CHECK-DAG: std r9, 80(r1) +; CHECK-DAG: std r8, 72(r1) +; CHECK-DAG: std r7, 64(r1) +; CHECK-DAG: std r6, 56(r1) +; CHECK-DAG: std r4, 40(r1) +; CHECK-DAG: cmpwi cr0, r3, 1 +; CHECK-DAG: std r5, 48(r1) +; CHECK-DAG: addis [[REG:r[0-9]+]], r2, .LCPI17_0@toc@ha +; CHECK-DAG: addi [[REG1:r[0-9]+]], [[REG]], .LCPI17_0@toc@l +; CHECK-DAG: lxvx v2, 0, [[REG1]] ; CHECK-NEXT: bltlr cr0 ; CHECK-NEXT: # %bb.1: # %if.end ; CHECK-NEXT: addi r3, r1, 40 diff --git a/llvm/test/CodeGen/PowerPC/ppc64-byval-align.ll b/llvm/test/CodeGen/PowerPC/ppc64-byval-align.ll index f91da59..db0cd86 100644 --- a/llvm/test/CodeGen/PowerPC/ppc64-byval-align.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-byval-align.ll @@ -34,7 +34,7 @@ entry: ret i64 %0 } ; CHECK-LABEL: @callee2 -; CHECK: ld 3, 128(1) +; CHECK: ld {{[0-9]+}}, 128(1) ; CHECK: blr declare i64 @test2(%struct.pad* byval, i32 signext, %struct.test* byval align 16) diff --git a/llvm/test/CodeGen/Thumb/frame-access.ll b/llvm/test/CodeGen/Thumb/frame-access.ll index 9cbed5ed..a9d2999 100644 --- a/llvm/test/CodeGen/Thumb/frame-access.ll +++ b/llvm/test/CodeGen/Thumb/frame-access.ll @@ -173,9 +173,9 @@ entry: ; Setup frame pointer ; CHECK: add r7, sp, #8 ; Register varargs stored via FP -; CHECK: str r3, [r7, #16] -; CHECK-NEXT: str r2, [r7, #12] -; CHECK-NEXT: str r1, [r7, #8] +; CHECK-DAG: str r3, [r7, #16] +; CHECK-DAG: str r2, [r7, #12] +; CHECK-DAG: str r1, [r7, #8] ; Moving SP, access via SP ; int test_args_moving_sp(int a, int b, int c, int d, int e) { diff --git a/llvm/test/CodeGen/Thumb/mvn.ll b/llvm/test/CodeGen/Thumb/mvn.ll index 1e16eff..a108bfd 100644 --- a/llvm/test/CodeGen/Thumb/mvn.ll +++ b/llvm/test/CodeGen/Thumb/mvn.ll @@ -194,26 +194,26 @@ for.cond.cleanup: define void @test128(i128* %a) { ; CHECK-LABEL: test128: -; CHECK: ldr r1, [r0, #4] +; CHECK: ldr r1, [r0, #8] ; CHECK-NEXT: ldr r2, .LCPI8_0 ; CHECK-NEXT: eors r2, r1 -; CHECK-NEXT: str r2, [r0, #4] +; CHECK-NEXT: str r2, [r0, #8] ; CHECK-NEXT: ldr r1, [r0] ; CHECK-NEXT: ldr r2, .LCPI8_1 ; CHECK-NEXT: eors r2, r1 ; CHECK-NEXT: str r2, [r0] -; CHECK-NEXT: ldr r1, [r0, #8] +; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: ldr r2, .LCPI8_2 ; CHECK-NEXT: eors r2, r1 -; CHECK-NEXT: str r2, [r0, #8] +; CHECK-NEXT: str r2, [r0, #4] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LCPI8_0: -; CHECK-NEXT: .long 4075008415 +; CHECK-NEXT: .long 6692605 ; CHECK-NEXT: .LCPI8_1: ; CHECK-NEXT: .long 2080661269 ; CHECK-NEXT: .LCPI8_2: -; CHECK-NEXT: .long 6692605 +; CHECK-NEXT: .long 4075008415 %x = load i128, i128* %a %xn = xor i128 %x, 123456789123456789123456789 store i128 %xn, i128* %a diff --git a/llvm/test/CodeGen/X86/aligned-variadic.ll b/llvm/test/CodeGen/X86/aligned-variadic.ll index 1ea5729..d827444 100644 --- a/llvm/test/CodeGen/X86/aligned-variadic.ll +++ b/llvm/test/CodeGen/X86/aligned-variadic.ll @@ -17,7 +17,7 @@ entry: store i8* %overflow_arg_area.next, i8** %overflow_arg_area_p, align 8 ; X32: leal 68(%esp), [[REG:%.*]] ; X32: movl [[REG]], 16(%esp) -; X64: leaq 232(%rsp), [[REG:%.*]] +; X64: leaq 256(%rsp), 
[[REG:%.*]] ; X64: movq [[REG]], 184(%rsp) ; X64: leaq 176(%rsp), %rdi call void @qux(%struct.__va_list_tag* %arraydecay) diff --git a/llvm/test/CodeGen/X86/atomic-idempotent.ll b/llvm/test/CodeGen/X86/atomic-idempotent.ll index c67a926..e7e2430 100644 --- a/llvm/test/CodeGen/X86/atomic-idempotent.ll +++ b/llvm/test/CodeGen/X86/atomic-idempotent.ll @@ -132,10 +132,10 @@ define i128 @or128(i128* %p) { ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edi -; X32-NEXT: movl %edi, 12(%esi) -; X32-NEXT: movl %edx, 8(%esi) -; X32-NEXT: movl %ecx, 4(%esi) +; X32-NEXT: movl %edi, 8(%esi) +; X32-NEXT: movl %edx, 12(%esi) ; X32-NEXT: movl %eax, (%esi) +; X32-NEXT: movl %ecx, 4(%esi) ; X32-NEXT: movl %esi, %eax ; X32-NEXT: leal -8(%ebp), %esp ; X32-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll index eabe82d..1b28c72 100644 --- a/llvm/test/CodeGen/X86/avx-load-store.ll +++ b/llvm/test/CodeGen/X86/avx-load-store.ll @@ -245,8 +245,8 @@ define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups (%rsi), %xmm0 ; CHECK-NEXT: vmovups 16(%rsi), %xmm1 -; CHECK-NEXT: vmovups %xmm0, (%rdi) ; CHECK-NEXT: vmovups %xmm1, 16(%rdi) +; CHECK-NEXT: vmovups %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; CHECK_O0-LABEL: add8i32: @@ -290,8 +290,8 @@ define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps (%rsi), %xmm0 ; CHECK-NEXT: vmovaps 16(%rsi), %xmm1 -; CHECK-NEXT: vmovaps %xmm0, (%rdi) ; CHECK-NEXT: vmovaps %xmm1, 16(%rdi) +; CHECK-NEXT: vmovaps %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; CHECK_O0-LABEL: add4i64a16: diff --git a/llvm/test/CodeGen/X86/btc_bts_btr.ll b/llvm/test/CodeGen/X86/btc_bts_btr.ll index 6f43cf7..5e64be9 100644 --- a/llvm/test/CodeGen/X86/btc_bts_btr.ll +++ b/llvm/test/CodeGen/X86/btc_bts_btr.ll @@ -859,8 +859,8 @@ define void @btr_64_dont_fold(i64* %x, i64 %n) { ; X86-NEXT: .LBB33_2: ; X86-NEXT: notl %esi ; X86-NEXT: notl %edx -; X86-NEXT: andl %esi, 4(%eax) ; X86-NEXT: andl %edx, (%eax) +; X86-NEXT: andl %esi, 4(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -899,8 +899,8 @@ define void @bts_64_dont_fold(i64* %x, i64 %n) { ; X86-NEXT: movl %edx, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB34_2: -; X86-NEXT: orl %esi, 4(%eax) ; X86-NEXT: orl %edx, (%eax) +; X86-NEXT: orl %esi, 4(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -938,8 +938,8 @@ define void @btc_64_dont_fold(i64* %x, i64 %n) { ; X86-NEXT: movl %edx, %esi ; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB35_2: -; X86-NEXT: xorl %esi, 4(%eax) ; X86-NEXT: xorl %edx, (%eax) +; X86-NEXT: xorl %esi, 4(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/combine-sbb.ll b/llvm/test/CodeGen/X86/combine-sbb.ll index fcdba85..43011b0 100644 --- a/llvm/test/CodeGen/X86/combine-sbb.ll +++ b/llvm/test/CodeGen/X86/combine-sbb.ll @@ -77,8 +77,8 @@ define void @PR25858_i64(%WideUInt64* sret, %WideUInt64*, %WideUInt64*) nounwind ; X86-NEXT: movzbl %bl, %ecx ; X86-NEXT: subl %ecx, %edx ; X86-NEXT: sbbl $0, %ebp -; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %edi, 4(%eax) ; X86-NEXT: movl %edx, 8(%eax) ; X86-NEXT: movl %ebp, 12(%eax) ; X86-NEXT: popl %esi diff --git a/llvm/test/CodeGen/X86/constant-combines.ll b/llvm/test/CodeGen/X86/constant-combines.ll index f3d2df6..20fbedb 
100644 --- a/llvm/test/CodeGen/X86/constant-combines.ll +++ b/llvm/test/CodeGen/X86/constant-combines.ll @@ -19,7 +19,7 @@ define void @PR22524({ float, float }* %arg) { ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: mulss %xmm0, %xmm1 -; CHECK-NEXT: movq $0, (%rdi) +; CHECK-NEXT: movl $0, (%rdi) ; CHECK-NEXT: movss %xmm1, 4(%rdi) ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index 1a28852..e5ff601 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -8,10 +8,10 @@ define void @add256(<16 x i32>* %a, <16 x i32>* %b, <16 x i32>* %c) "min-legal-v ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0 ; CHECK-NEXT: vpaddd 32(%rsi), %ymm1, %ymm1 -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vpaddd (%rsi), %ymm0, %ymm0 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %d = load <16 x i32>, <16 x i32>* %a @@ -85,10 +85,10 @@ define void @pmaddwd_32_256(<32 x i16>* %APtr, <32 x i16>* %BPtr, <16 x i32>* %C ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 ; CHECK-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1 -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %A = load <32 x i16>, <32 x i16>* %APtr @@ -128,10 +128,10 @@ define void @psubus_64i8_max_256(<64 x i8>* %xptr, <64 x i8>* %yptr, <64 x i8>* ; CHECK: # %bb.0: ; CHECK-NEXT: vmovdqa (%rdi), %ymm0 ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 -; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0 ; CHECK-NEXT: vpsubusb 32(%rsi), %ymm1, %ymm1 -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vpsubusb (%rsi), %ymm0, %ymm0 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq %x = load <64 x i8>, <64 x i8>* %xptr @@ -652,27 +652,27 @@ define void @mul256(<64 x i8>* %a, <64 x i8>* %b, <64 x i8>* %c) "min-legal-vect ; CHECK-NEXT: vmovdqa 32(%rdi), %ymm1 ; CHECK-NEXT: vmovdqa (%rsi), %ymm2 ; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 -; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] -; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] +; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] ; CHECK-NEXT: 
vpmullw %ymm4, %ymm5, %ymm4 ; CHECK-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] ; CHECK-NEXT: vpand %ymm5, %ymm4, %ymm4 -; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] -; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] -; CHECK-NEXT: vpmullw %ymm2, %ymm0, %ymm0 -; CHECK-NEXT: vpand %ymm5, %ymm0, %ymm0 -; CHECK-NEXT: vpackuswb %ymm4, %ymm0, %ymm0 -; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15],ymm3[24],ymm0[24],ymm3[25],ymm0[25],ymm3[26],ymm0[26],ymm3[27],ymm0[27],ymm3[28],ymm0[28],ymm3[29],ymm0[29],ymm3[30],ymm0[30],ymm3[31],ymm0[31] -; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31] -; CHECK-NEXT: vpmullw %ymm2, %ymm4, %ymm2 -; CHECK-NEXT: vpand %ymm5, %ymm2, %ymm2 ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] ; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] ; CHECK-NEXT: vpmullw %ymm3, %ymm1, %ymm1 ; CHECK-NEXT: vpand %ymm5, %ymm1, %ymm1 -; CHECK-NEXT: vpackuswb %ymm2, %ymm1, %ymm1 -; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) +; CHECK-NEXT: vpackuswb %ymm4, %ymm1, %ymm1 +; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15],ymm2[24],ymm0[24],ymm2[25],ymm0[25],ymm2[26],ymm0[26],ymm2[27],ymm0[27],ymm2[28],ymm0[28],ymm2[29],ymm0[29],ymm2[30],ymm0[30],ymm2[31],ymm0[31] +; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; CHECK-NEXT: vpmullw %ymm3, %ymm4, %ymm3 +; CHECK-NEXT: vpand %ymm5, %ymm3, %ymm3 +; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[16],ymm0[16],ymm2[17],ymm0[17],ymm2[18],ymm0[18],ymm2[19],ymm0[19],ymm2[20],ymm0[20],ymm2[21],ymm0[21],ymm2[22],ymm0[22],ymm2[23],ymm0[23] +; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; CHECK-NEXT: vpmullw %ymm2, %ymm0, %ymm0 +; CHECK-NEXT: vpand %ymm5, %ymm0, %ymm0 +; CHECK-NEXT: vpackuswb %ymm3, %ymm0, %ymm0 ; CHECK-NEXT: vmovdqa %ymm0, (%rdx) +; CHECK-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: 
retq %d = load <64 x i8>, <64 x i8>* %a diff --git a/llvm/test/CodeGen/X86/musttail-varargs.ll b/llvm/test/CodeGen/X86/musttail-varargs.ll index 6a338c5c..b62343f 100644 --- a/llvm/test/CodeGen/X86/musttail-varargs.ll +++ b/llvm/test/CodeGen/X86/musttail-varargs.ll @@ -56,11 +56,11 @@ define void @f_thunk(i8* %this, ...) { ; LINUX-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ; LINUX-NEXT: .LBB0_2: -; LINUX-NEXT: movq %r15, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r12, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %r13, {{[0-9]+}}(%rsp) -; LINUX-NEXT: movq %rbp, {{[0-9]+}}(%rsp) ; LINUX-NEXT: movq %rbx, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %rbp, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r13, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r12, {{[0-9]+}}(%rsp) +; LINUX-NEXT: movq %r15, {{[0-9]+}}(%rsp) ; LINUX-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; LINUX-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; LINUX-NEXT: leaq {{[0-9]+}}(%rsp), %rax @@ -150,11 +150,11 @@ define void @f_thunk(i8* %this, ...) { ; LINUX-X32-NEXT: movaps %xmm6, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movaps %xmm7, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: .LBB0_2: -; LINUX-X32-NEXT: movq %r15, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r12, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %r13, {{[0-9]+}}(%esp) -; LINUX-X32-NEXT: movq %rbp, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: movq %rbx, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %rbp, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r13, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r12, {{[0-9]+}}(%esp) +; LINUX-X32-NEXT: movq %r15, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: leal {{[0-9]+}}(%rsp), %eax ; LINUX-X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; LINUX-X32-NEXT: leal {{[0-9]+}}(%rsp), %eax @@ -223,9 +223,9 @@ define void @f_thunk(i8* %this, ...) { ; WINDOWS-NEXT: movq %r8, %rdi ; WINDOWS-NEXT: movq %rdx, %rbx ; WINDOWS-NEXT: movq %rcx, %rbp -; WINDOWS-NEXT: movq %r9, {{[0-9]+}}(%rsp) -; WINDOWS-NEXT: movq %r8, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: movq %rdx, {{[0-9]+}}(%rsp) +; WINDOWS-NEXT: movq %r8, {{[0-9]+}}(%rsp) +; WINDOWS-NEXT: movq %r9, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: leaq {{[0-9]+}}(%rsp), %rax ; WINDOWS-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; WINDOWS-NEXT: callq get_f diff --git a/llvm/test/CodeGen/X86/musttail.ll b/llvm/test/CodeGen/X86/musttail.ll index 927322b..6192d31 100644 --- a/llvm/test/CodeGen/X86/musttail.ll +++ b/llvm/test/CodeGen/X86/musttail.ll @@ -46,8 +46,8 @@ define i32 @t4({}* %fn, i32 %n, i32 %r) { ; CHECK-LABEL: t4: ; CHECK: incl %[[r:.*]] ; CHECK: decl %[[n:.*]] -; CHECK: movl %[[r]], {{[0-9]+}}(%esp) -; CHECK: movl %[[n]], {{[0-9]+}}(%esp) +; CHECK-DAG: movl %[[r]], {{[0-9]+}}(%esp) +; CHECK-DAG: movl %[[n]], {{[0-9]+}}(%esp) ; CHECK: jmpl *%{{.*}} entry: @@ -71,8 +71,8 @@ define i32 @t5({}* %fn, i32 %n, i32 %r) alignstack(32) { ; CHECK: incl %[[r:.*]] ; CHECK: decl %[[n:.*]] ; Store them through ebp, since that's the only stable arg pointer. -; CHECK: movl %[[r]], {{[0-9]+}}(%ebp) -; CHECK: movl %[[n]], {{[0-9]+}}(%ebp) +; CHECK-DAG: movl %[[r]], {{[0-9]+}}(%ebp) +; CHECK-DAG: movl %[[n]], {{[0-9]+}}(%ebp) ; Epilogue. 
; CHECK: leal {{[-0-9]+}}(%ebp), %esp ; CHECK: popl %esi diff --git a/llvm/test/CodeGen/X86/nosse-vector.ll b/llvm/test/CodeGen/X86/nosse-vector.ll index ec97b1e..ef2b40a 100644 --- a/llvm/test/CodeGen/X86/nosse-vector.ll +++ b/llvm/test/CodeGen/X86/nosse-vector.ll @@ -146,7 +146,7 @@ define void @sitofp_4i64_4f32_mem(<4 x i64>* %p0, <4 x float>* %p1) nounwind { ; X32-NEXT: subl $48, %esp ; X32-NEXT: movl 8(%ebp), %eax ; X32-NEXT: movl 24(%eax), %ecx -; X32-NEXT: movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill +; X32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X32-NEXT: movl 28(%eax), %ecx ; X32-NEXT: movl %ecx, (%esp) # 4-byte Spill ; X32-NEXT: movl 16(%eax), %esi @@ -163,7 +163,7 @@ define void @sitofp_4i64_4f32_mem(<4 x i64>* %p0, <4 x float>* %p1) nounwind { ; X32-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X32-NEXT: movl (%esp), %eax # 4-byte Reload ; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax # 4-byte Reload +; X32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X32-NEXT: movl 12(%ebp), %eax ; X32-NEXT: fildll {{[0-9]+}}(%esp) @@ -277,10 +277,10 @@ define void @add_2i64_mem(<2 x i64>* %p0, <2 x i64>* %p1, <2 x i64>* %p2) nounwi ; X32-NEXT: adcl 4(%ecx), %edx ; X32-NEXT: addl 8(%ecx), %edi ; X32-NEXT: adcl 12(%ecx), %esi -; X32-NEXT: movl %esi, 12(%eax) ; X32-NEXT: movl %edi, 8(%eax) -; X32-NEXT: movl %edx, 4(%eax) +; X32-NEXT: movl %esi, 12(%eax) ; X32-NEXT: movl %ebx, (%eax) +; X32-NEXT: movl %edx, 4(%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: popl %ebx diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index 5f15f88..a4b8f58 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -1497,111 +1497,111 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2 define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind { ; SSE2-LABEL: interleave_24i32_in: ; SSE2: # %bb.0: -; SSE2-NEXT: movups (%rsi), %xmm5 -; SSE2-NEXT: movups 16(%rsi), %xmm8 -; SSE2-NEXT: movups (%rdx), %xmm6 -; SSE2-NEXT: movups 16(%rdx), %xmm3 -; SSE2-NEXT: movups (%rcx), %xmm0 -; SSE2-NEXT: movups 16(%rcx), %xmm4 -; SSE2-NEXT: movaps %xmm0, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm5[1,0] -; SSE2-NEXT: movaps %xmm5, %xmm1 -; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0,2] +; SSE2-NEXT: movups (%rsi), %xmm1 +; SSE2-NEXT: movups 16(%rsi), %xmm0 +; SSE2-NEXT: movups (%rdx), %xmm8 +; SSE2-NEXT: movups 16(%rdx), %xmm5 +; SSE2-NEXT: movups (%rcx), %xmm3 +; SSE2-NEXT: movups 16(%rcx), %xmm6 +; SSE2-NEXT: movaps %xmm3, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm1[1,0] +; SSE2-NEXT: movaps %xmm1, %xmm9 +; SSE2-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1] +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm7[0,2] ; SSE2-NEXT: movaps %xmm5, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,1] -; SSE2-NEXT: movaps %xmm0, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm6[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,2],xmm6[3,2] +; SSE2-NEXT: movaps %xmm6, %xmm4 +; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm7[0,2] +; SSE2-NEXT: movaps %xmm0, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,1] +; SSE2-NEXT: movaps %xmm6, %xmm2 +; SSE2-NEXT: shufps 
{{.*#+}} xmm2 = xmm2[1,0],xmm5[1,0] ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,2],xmm0[3,2] -; SSE2-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,2] -; SSE2-NEXT: movaps %xmm4, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[1,0] -; SSE2-NEXT: movaps %xmm8, %xmm6 -; SSE2-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,0],xmm0[1,0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,2] ; SSE2-NEXT: movaps %xmm8, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,1] -; SSE2-NEXT: movaps %xmm4, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[1,0] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,2],xmm4[3,2] -; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm3[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,2],xmm3[3,2] +; SSE2-NEXT: movaps %xmm3, %xmm6 +; SSE2-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm5[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,1] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm8[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[0,2] +; SSE2-NEXT: movups %xmm3, 16(%rdi) +; SSE2-NEXT: movups %xmm6, 32(%rdi) +; SSE2-NEXT: movups %xmm0, 48(%rdi) +; SSE2-NEXT: movups %xmm2, 64(%rdi) ; SSE2-NEXT: movups %xmm4, 80(%rdi) -; SSE2-NEXT: movups %xmm7, 64(%rdi) -; SSE2-NEXT: movups %xmm6, 48(%rdi) -; SSE2-NEXT: movups %xmm0, 32(%rdi) -; SSE2-NEXT: movups %xmm2, 16(%rdi) -; SSE2-NEXT: movups %xmm1, (%rdi) +; SSE2-NEXT: movups %xmm9, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i32_in: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqu (%rsi), %xmm5 -; SSE42-NEXT: movdqu 16(%rsi), %xmm2 -; SSE42-NEXT: movdqu (%rdx), %xmm6 -; SSE42-NEXT: movdqu 16(%rdx), %xmm1 -; SSE42-NEXT: movdqu (%rcx), %xmm7 -; SSE42-NEXT: movdqu 16(%rcx), %xmm4 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,1,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3],xmm3[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5],xmm3[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm6[1,1,2,2] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3],xmm3[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm5[4,5],xmm3[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,3,2,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm5[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm6[2,2,3,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm7[0,1,2,3],xmm5[4,5],xmm7[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,1,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5],xmm7[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm1[1,1,2,2] -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5],xmm7[6,7] -; SSE42-NEXT: pshufd 
{{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3],xmm4[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4,5],xmm4[6,7] -; SSE42-NEXT: movdqu %xmm1, 80(%rdi) -; SSE42-NEXT: movdqu %xmm7, 64(%rdi) -; SSE42-NEXT: movdqu %xmm6, 48(%rdi) -; SSE42-NEXT: movdqu %xmm5, 32(%rdi) -; SSE42-NEXT: movdqu %xmm3, 16(%rdi) -; SSE42-NEXT: movdqu %xmm0, (%rdi) +; SSE42-NEXT: movdqu (%rsi), %xmm8 +; SSE42-NEXT: movdqu 16(%rsi), %xmm4 +; SSE42-NEXT: movdqu (%rdx), %xmm2 +; SSE42-NEXT: movdqu 16(%rdx), %xmm5 +; SSE42-NEXT: movdqu (%rcx), %xmm3 +; SSE42-NEXT: movdqu 16(%rcx), %xmm6 +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3],xmm7[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm7[0,1,2,3],xmm1[4,5],xmm7[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm4[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm7 = xmm5[2,2,3,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm7 = xmm0[0,1,2,3],xmm7[4,5],xmm0[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,2,2] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5],xmm0[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm5 = xmm4[0,1,2,3],xmm5[4,5],xmm4[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm8[2,3,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm4[2,3],xmm6[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,2,3,3] +; SSE42-NEXT: pblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5],xmm6[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm8[4,5],xmm2[6,7] +; SSE42-NEXT: movdqu %xmm2, 16(%rdi) +; SSE42-NEXT: movdqu %xmm4, 32(%rdi) +; SSE42-NEXT: movdqu %xmm5, 48(%rdi) +; SSE42-NEXT: movdqu %xmm0, 64(%rdi) +; SSE42-NEXT: movdqu %xmm7, 80(%rdi) +; SSE42-NEXT: movdqu %xmm1, (%rdi) ; SSE42-NEXT: retq ; ; AVX1-LABEL: interleave_24i32_in: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovupd (%rsi), %ymm0 ; AVX1-NEXT: vmovupd (%rcx), %ymm1 -; AVX1-NEXT: vmovups (%rdx), %xmm2 -; AVX1-NEXT: vmovups 16(%rdx), %xmm3 -; AVX1-NEXT: vmovups (%rsi), %xmm4 -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm2[2,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm4[0,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1] +; AVX1-NEXT: vmovups 16(%rcx), %xmm2 +; AVX1-NEXT: vmovups (%rdx), %xmm3 +; AVX1-NEXT: vmovups 16(%rdx), %xmm4 +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2] ; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; AVX1-NEXT: 
vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; AVX1-NEXT: vmovups 16(%rcx), %xmm4 -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,0],xmm4[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,1],xmm5[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3] ; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX1-NEXT: vmovups (%rsi), %xmm4 +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1] +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0] +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm1 = ymm1[1,1,2,2] @@ -1609,8 +1609,8 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, ; AVX1-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7] ; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; AVX1-NEXT: vmovups %ymm0, 32(%rdi) -; AVX1-NEXT: vmovups %ymm3, 64(%rdi) -; AVX1-NEXT: vmovups %ymm2, (%rdi) +; AVX1-NEXT: vmovups %ymm3, (%rdi) +; AVX1-NEXT: vmovups %ymm2, 64(%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1653,19 +1653,19 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5,6],ymm4[7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[0,3,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] -; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,0,2,1] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-FAST-NEXT: vbroadcastsd (%rcx), %ymm5 -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm0[1,1,2,2] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] -; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5,6],ymm0[7] -; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm1 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] +; AVX2-FAST-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[0,0,3,3,4,4,7,7] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [1,0,2,2,1,0,2,2] +; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm1 +; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,2,1] +; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] +; AVX2-FAST-NEXT: vbroadcastsd (%rcx), %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovups %ymm0, 32(%rdi) -; AVX2-FAST-NEXT: vmovups %ymm4, (%rdi) +; AVX2-FAST-NEXT: vmovups %ymm0, (%rdi) +; AVX2-FAST-NEXT: vmovups %ymm2, 
32(%rdi) ; AVX2-FAST-NEXT: vmovups %ymm3, 64(%rdi) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq @@ -1674,32 +1674,32 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, ; XOP: # %bb.0: ; XOP-NEXT: vmovupd (%rsi), %ymm0 ; XOP-NEXT: vmovups (%rcx), %ymm1 -; XOP-NEXT: vmovups (%rdx), %xmm2 -; XOP-NEXT: vmovups 16(%rdx), %xmm3 -; XOP-NEXT: vmovups (%rsi), %xmm4 -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm2[2,0] -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2] -; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm4[0,0] -; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1] +; XOP-NEXT: vmovups 16(%rcx), %xmm2 +; XOP-NEXT: vmovups (%rdx), %xmm3 +; XOP-NEXT: vmovups 16(%rdx), %xmm4 +; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[3,0],xmm2[3,0] +; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,1],xmm5[0,2] +; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm4[1,0] +; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,2] ; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0] -; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; XOP-NEXT: vmovups 16(%rcx), %xmm4 -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,0],xmm4[3,0] -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,1],xmm5[0,2] -; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,0] -; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,2] -; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 ; XOP-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3] ; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] +; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] +; XOP-NEXT: vmovups (%rsi), %xmm4 +; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm3[2,0] +; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[1,1],xmm5[0,2] +; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,0],xmm4[0,0] +; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[2,0],xmm4[2,1] +; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 +; XOP-NEXT: vmovddup {{.*#+}} xmm4 = xmm1[0,0] +; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 ; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[2,3],ymm0[4],ymm1[5,4],ymm0[5] ; XOP-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7] ; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] ; XOP-NEXT: vmovups %ymm0, 32(%rdi) -; XOP-NEXT: vmovups %ymm3, 64(%rdi) -; XOP-NEXT: vmovups %ymm2, (%rdi) +; XOP-NEXT: vmovups %ymm3, (%rdi) +; XOP-NEXT: vmovups %ymm2, 64(%rdi) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq %s1 = load <8 x i32>, <8 x i32>* %q1, align 4 diff --git a/llvm/test/CodeGen/X86/pr40631_deadstore_elision.ll b/llvm/test/CodeGen/X86/pr40631_deadstore_elision.ll index c742ce4..f330e0f 100644 --- a/llvm/test/CodeGen/X86/pr40631_deadstore_elision.ll +++ b/llvm/test/CodeGen/X86/pr40631_deadstore_elision.ll @@ -12,13 +12,12 @@ define i32 @ipt_do_table(%struct.sk_buff* noalias nocapture readonly) { ; CHECK-NEXT: movq (%rdi), %rax ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movq $0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq $170, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps {{.*#+}} xmm0 = [12297829382473034410,12297829382473034410] ; CHECK-NEXT: movaps %xmm0, (%rsp) ; CHECK-NEXT: movabsq $-6148914691236517206, %rcx # imm = 0xAAAAAAAAAAAAAAAA ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-NEXT: movb $-86, {{[0-9]+}}(%rsp) 
; CHECK-NEXT: movzwl 2(%rax), %ecx ; CHECK-NEXT: andl $8191, %ecx # imm = 0x1FFF ; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/rotate.ll b/llvm/test/CodeGen/X86/rotate.ll index 0d92e26..d2ecc28 100644 --- a/llvm/test/CodeGen/X86/rotate.ll +++ b/llvm/test/CodeGen/X86/rotate.ll @@ -572,8 +572,8 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { ; X86-NEXT: movl %edx, %esi ; X86-NEXT: shldl $31, %ecx, %esi ; X86-NEXT: shldl $31, %edx, %ecx -; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: movl %ecx, 4(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/rotate4.ll b/llvm/test/CodeGen/X86/rotate4.ll index 7f5e426..fa7f550 100644 --- a/llvm/test/CodeGen/X86/rotate4.ll +++ b/llvm/test/CodeGen/X86/rotate4.ll @@ -244,32 +244,32 @@ define void @rotate_left_m64(i64 *%pa, i64 %b) { ; X86-NEXT: .cfi_offset %ebp, -8 ; X86-NEXT: movb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl (%eax), %edx +; X86-NEXT: movl (%eax), %esi ; X86-NEXT: movl 4(%eax), %ebx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: shll %cl, %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: shll %cl, %edx ; X86-NEXT: movl %ebx, %edi -; X86-NEXT: shldl %cl, %edx, %edi +; X86-NEXT: shldl %cl, %esi, %edi ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB6_2 ; X86-NEXT: # %bb.1: -; X86-NEXT: movl %esi, %edi -; X86-NEXT: xorl %esi, %esi +; X86-NEXT: movl %edx, %edi +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .LBB6_2: ; X86-NEXT: negb %cl ; X86-NEXT: movl %ebx, %ebp ; X86-NEXT: shrl %cl, %ebp -; X86-NEXT: shrdl %cl, %ebx, %edx +; X86-NEXT: shrdl %cl, %ebx, %esi ; X86-NEXT: testb $32, %cl ; X86-NEXT: je .LBB6_4 ; X86-NEXT: # %bb.3: -; X86-NEXT: movl %ebp, %edx +; X86-NEXT: movl %ebp, %esi ; X86-NEXT: xorl %ebp, %ebp ; X86-NEXT: .LBB6_4: +; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %ebp, %edi -; X86-NEXT: orl %edx, %esi +; X86-NEXT: movl %edx, (%eax) ; X86-NEXT: movl %edi, 4(%eax) -; X86-NEXT: movl %esi, (%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi @@ -336,10 +336,10 @@ define void @rotate_right_m64(i64 *%pa, i64 %b) { ; X86-NEXT: movl %ebp, %esi ; X86-NEXT: xorl %ebp, %ebp ; X86-NEXT: .LBB7_4: -; X86-NEXT: orl %esi, %edx ; X86-NEXT: orl %ebp, %edi -; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: orl %esi, %edx ; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %edx, 4(%eax) ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll index 320cc07..186141a 100644 --- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll @@ -460,10 +460,10 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpaddsw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vpaddsw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vpaddsw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v12i16: diff --git a/llvm/test/CodeGen/X86/shift-and.ll b/llvm/test/CodeGen/X86/shift-and.ll index fc8eb2f..00dc635 100644 --- a/llvm/test/CodeGen/X86/shift-and.ll +++ b/llvm/test/CodeGen/X86/shift-and.ll @@ -144,8 +144,8 @@ define void @t5ptr(i64 %t, i64* %ptr) nounwind { ; X32-NEXT: movl %esi, %edx ; X32-NEXT: xorl %esi, %esi ; X32-NEXT: .LBB5_2: -; X32-NEXT: movl %esi, 4(%eax) ; X32-NEXT: movl %edx, 
(%eax) +; X32-NEXT: movl %esi, 4(%eax) ; X32-NEXT: popl %esi ; X32-NEXT: popl %edi ; X32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/shrink_vmul-widen.ll b/llvm/test/CodeGen/X86/shrink_vmul-widen.ll index d0fad23..0ed79ea 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul-widen.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul-widen.ll @@ -746,18 +746,18 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -818,18 +818,18 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X64-SSE-NEXT: movdqa %xmm3, %xmm4 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) ; 
X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: mul_16xi16: @@ -1262,18 +1262,18 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly % ; X86-SSE-NEXT: pmulhw %xmm0, %xmm4 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm4 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1334,18 +1334,18 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly % ; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X64-SSE-NEXT: movdqa %xmm3, %xmm4 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: mul_16xi16_sext: diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index 8a8a396..85ffce8 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -740,18 +740,18 @@ 
define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6 ; X86-SSE-NEXT: pmulhuw %xmm0, %xmm4 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm4 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -812,18 +812,18 @@ define void @mul_16xi16(i8* nocapture readonly %a, i8* nocapture readonly %b, i6 ; X64-SSE-NEXT: pmulhuw %xmm0, %xmm4 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X64-SSE-NEXT: movdqa %xmm3, %xmm4 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm4 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: mul_16xi16: @@ -1240,18 +1240,18 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly % ; X86-SSE-NEXT: pmulhw %xmm0, %xmm4 ; X86-SSE-NEXT: pmullw %xmm0, %xmm2 ; X86-SSE-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pmulhw %xmm1, %xmm4 ; X86-SSE-NEXT: pmullw %xmm1, %xmm3 ; X86-SSE-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X86-SSE-NEXT: movdqu %xmm3, 48(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm1, 32(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm2, 16(%esi,%ecx,4) -; X86-SSE-NEXT: movdqu %xmm0, (%esi,%ecx,4) +; X86-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X86-SSE-NEXT: movdqu %xmm3, 32(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm1, 48(%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm2, (%esi,%ecx,4) +; X86-SSE-NEXT: movdqu %xmm0, 16(%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl ; @@ -1312,18 +1312,18 @@ define void @mul_16xi16_sext(i8* nocapture readonly %a, i8* nocapture readonly % ; X64-SSE-NEXT: pmulhw %xmm0, %xmm4 ; X64-SSE-NEXT: pmullw %xmm0, %xmm2 ; X64-SSE-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; X64-SSE-NEXT: movdqa %xmm3, %xmm4 ; X64-SSE-NEXT: pmulhw %xmm1, %xmm4 ; X64-SSE-NEXT: pmullw %xmm1, %xmm3 ; X64-SSE-NEXT: movdqa %xmm3, %xmm1 -; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; X64-SSE-NEXT: movdqu %xmm3, 48(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm1, 32(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm2, 16(%rax,%rdx,4) -; X64-SSE-NEXT: movdqu %xmm0, (%rax,%rdx,4) +; X64-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; X64-SSE-NEXT: movdqu %xmm3, 32(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm1, 48(%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm2, (%rax,%rdx,4) +; X64-SSE-NEXT: movdqu %xmm0, 16(%rax,%rdx,4) ; X64-SSE-NEXT: retq ; ; X64-AVX1-LABEL: mul_16xi16_sext: diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll index 5ba7b17..b9adde1 100644 --- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll @@ -460,10 +460,10 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpsubsw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vpsubsw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; 
AVX1-NEXT: vpsubsw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v12i16: diff --git a/llvm/test/CodeGen/X86/uadd_sat_vec.ll b/llvm/test/CodeGen/X86/uadd_sat_vec.ll index c52c489..83fe8c1 100644 --- a/llvm/test/CodeGen/X86/uadd_sat_vec.ll +++ b/llvm/test/CodeGen/X86/uadd_sat_vec.ll @@ -460,10 +460,10 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpaddusw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vpaddusw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vpaddusw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v12i16: diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll index 72c0c51..e3d47d2 100644 --- a/llvm/test/CodeGen/X86/usub_sat_vec.ll +++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll @@ -460,10 +460,10 @@ define void @v12i16(<12 x i16>* %px, <12 x i16>* %py, <12 x i16>* %pz) nounwind ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vpsubusw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vpsubusw 16(%rsi), %xmm1, %xmm1 -; AVX1-NEXT: vmovq %xmm1, 16(%rdx) +; AVX1-NEXT: vpsubusw (%rsi), %xmm0, %xmm0 ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) +; AVX1-NEXT: vmovq %xmm1, 16(%rdx) ; AVX1-NEXT: retq ; ; AVX2-LABEL: v12i16: diff --git a/llvm/test/CodeGen/X86/vastart-defs-eflags.ll b/llvm/test/CodeGen/X86/vastart-defs-eflags.ll index 6ef6915..00e605a 100644 --- a/llvm/test/CodeGen/X86/vastart-defs-eflags.ll +++ b/llvm/test/CodeGen/X86/vastart-defs-eflags.ll @@ -21,11 +21,11 @@ define i32 @check_flag(i32 %flags, ...) 
nounwind { ; CHECK-NEXT: movaps %xmm6, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps %xmm7, {{[0-9]+}}(%rsp) ; CHECK-NEXT: LBB0_2: ## %entry -; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %r8, -{{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testl $512, %edi ## imm = 0x200 ; CHECK-NEXT: je LBB0_4 diff --git a/llvm/test/CodeGen/X86/vec_fpext.ll b/llvm/test/CodeGen/X86/vec_fpext.ll index b66d5d1..3007c8d 100644 --- a/llvm/test/CodeGen/X86/vec_fpext.ll +++ b/llvm/test/CodeGen/X86/vec_fpext.ll @@ -186,14 +186,14 @@ define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) { ; X32-SSE: # %bb.0: # %entry ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04] -; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm0 # encoding: [0x0f,0x5a,0x01] -; X32-SSE-NEXT: cvtps2pd 8(%ecx), %xmm1 # encoding: [0x0f,0x5a,0x49,0x08] -; X32-SSE-NEXT: cvtps2pd 16(%ecx), %xmm2 # encoding: [0x0f,0x5a,0x51,0x10] -; X32-SSE-NEXT: cvtps2pd 24(%ecx), %xmm3 # encoding: [0x0f,0x5a,0x59,0x18] -; X32-SSE-NEXT: movups %xmm3, 48(%eax) # encoding: [0x0f,0x11,0x58,0x30] -; X32-SSE-NEXT: movups %xmm2, 32(%eax) # encoding: [0x0f,0x11,0x50,0x20] -; X32-SSE-NEXT: movups %xmm1, 16(%eax) # encoding: [0x0f,0x11,0x48,0x10] -; X32-SSE-NEXT: movups %xmm0, (%eax) # encoding: [0x0f,0x11,0x00] +; X32-SSE-NEXT: cvtps2pd 8(%ecx), %xmm0 # encoding: [0x0f,0x5a,0x41,0x08] +; X32-SSE-NEXT: cvtps2pd (%ecx), %xmm1 # encoding: [0x0f,0x5a,0x09] +; X32-SSE-NEXT: cvtps2pd 24(%ecx), %xmm2 # encoding: [0x0f,0x5a,0x51,0x18] +; X32-SSE-NEXT: cvtps2pd 16(%ecx), %xmm3 # encoding: [0x0f,0x5a,0x59,0x10] +; X32-SSE-NEXT: movups %xmm3, 32(%eax) # encoding: [0x0f,0x11,0x58,0x20] +; X32-SSE-NEXT: movups %xmm2, 48(%eax) # encoding: [0x0f,0x11,0x50,0x30] +; X32-SSE-NEXT: movups %xmm1, (%eax) # encoding: [0x0f,0x11,0x08] +; X32-SSE-NEXT: movups %xmm0, 16(%eax) # encoding: [0x0f,0x11,0x40,0x10] ; X32-SSE-NEXT: retl # encoding: [0xc3] ; ; X32-AVX-LABEL: fpext_frommem8: @@ -218,14 +218,14 @@ define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) { ; ; X64-SSE-LABEL: fpext_frommem8: ; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm0 # encoding: [0x0f,0x5a,0x07] -; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm1 # encoding: [0x0f,0x5a,0x4f,0x08] -; X64-SSE-NEXT: cvtps2pd 16(%rdi), %xmm2 # encoding: [0x0f,0x5a,0x57,0x10] -; X64-SSE-NEXT: cvtps2pd 24(%rdi), %xmm3 # encoding: [0x0f,0x5a,0x5f,0x18] -; X64-SSE-NEXT: movups %xmm3, 48(%rsi) # encoding: [0x0f,0x11,0x5e,0x30] -; X64-SSE-NEXT: movups %xmm2, 32(%rsi) # encoding: [0x0f,0x11,0x56,0x20] -; X64-SSE-NEXT: movups %xmm1, 16(%rsi) # encoding: [0x0f,0x11,0x4e,0x10] -; X64-SSE-NEXT: movups %xmm0, (%rsi) # encoding: [0x0f,0x11,0x06] +; X64-SSE-NEXT: cvtps2pd 8(%rdi), %xmm0 # encoding: [0x0f,0x5a,0x47,0x08] +; X64-SSE-NEXT: cvtps2pd (%rdi), %xmm1 # encoding: [0x0f,0x5a,0x0f] +; X64-SSE-NEXT: cvtps2pd 24(%rdi), %xmm2 # encoding: [0x0f,0x5a,0x57,0x18] +; X64-SSE-NEXT: cvtps2pd 16(%rdi), %xmm3 # encoding: [0x0f,0x5a,0x5f,0x10] +; X64-SSE-NEXT: movups %xmm3, 32(%rsi) # encoding: [0x0f,0x11,0x5e,0x20] +; X64-SSE-NEXT: movups %xmm2, 48(%rsi) # encoding: [0x0f,0x11,0x56,0x30] +; X64-SSE-NEXT: movups %xmm1, 
(%rsi) # encoding: [0x0f,0x11,0x0e] +; X64-SSE-NEXT: movups %xmm0, 16(%rsi) # encoding: [0x0f,0x11,0x46,0x10] ; X64-SSE-NEXT: retq # encoding: [0xc3] ; ; X64-AVX-LABEL: fpext_frommem8: diff --git a/llvm/test/CodeGen/X86/widen_cast-2.ll b/llvm/test/CodeGen/X86/widen_cast-2.ll index 03d4700..e778091 100644 --- a/llvm/test/CodeGen/X86/widen_cast-2.ll +++ b/llvm/test/CodeGen/X86/widen_cast-2.ll @@ -21,9 +21,9 @@ define void @convert(<7 x i32>* %dst, <14 x i16>* %src) nounwind { ; CHECK-NEXT: movdqa 16(%edx,%eax), %xmm2 ; CHECK-NEXT: psubw %xmm0, %xmm1 ; CHECK-NEXT: psubw %xmm0, %xmm2 -; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax) -; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax) ; CHECK-NEXT: movd %xmm2, 16(%ecx,%eax) +; CHECK-NEXT: pextrd $1, %xmm2, 20(%ecx,%eax) +; CHECK-NEXT: pextrd $2, %xmm2, 24(%ecx,%eax) ; CHECK-NEXT: movdqa %xmm1, (%ecx,%eax) ; CHECK-NEXT: incl (%esp) ; CHECK-NEXT: cmpl $3, (%esp) diff --git a/llvm/test/CodeGen/X86/widen_load-2.ll b/llvm/test/CodeGen/X86/widen_load-2.ll index 5147db0..23b68b2 100644 --- a/llvm/test/CodeGen/X86/widen_load-2.ll +++ b/llvm/test/CodeGen/X86/widen_load-2.ll @@ -47,8 +47,8 @@ define void @add3i32_2(%i32vec3* sret %ret, %i32vec3* %ap, %i32vec3* %bp) { ; X86-NEXT: pinsrd $1, 4(%ecx), %xmm1 ; X86-NEXT: pinsrd $2, 8(%ecx), %xmm1 ; X86-NEXT: paddd %xmm0, %xmm1 -; X86-NEXT: pextrd $2, %xmm1, 8(%eax) ; X86-NEXT: pextrd $1, %xmm1, 4(%eax) +; X86-NEXT: pextrd $2, %xmm1, 8(%eax) ; X86-NEXT: movd %xmm1, (%eax) ; X86-NEXT: retl $4 ; @@ -81,9 +81,9 @@ define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) { ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddd (%ecx), %xmm0 ; X86-NEXT: paddd 16(%ecx), %xmm1 -; X86-NEXT: pextrd $2, %xmm1, 24(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: movd %xmm1, 16(%eax) +; X86-NEXT: pextrd $1, %xmm1, 20(%eax) +; X86-NEXT: pextrd $2, %xmm1, 24(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -94,8 +94,8 @@ define void @add7i32(%i32vec7* sret %ret, %i32vec7* %ap, %i32vec7* %bp) { ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: paddd (%rdx), %xmm0 ; X64-NEXT: paddd 16(%rdx), %xmm1 -; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) ; X64-NEXT: movq %xmm1, 16(%rdi) +; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i32vec7, %i32vec7* %ap, align 16 @@ -116,10 +116,10 @@ define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) { ; X86-NEXT: movdqa (%edx), %xmm1 ; X86-NEXT: movdqa 16(%edx), %xmm2 ; X86-NEXT: paddd (%ecx), %xmm1 -; X86-NEXT: paddd 16(%ecx), %xmm2 ; X86-NEXT: paddd 32(%ecx), %xmm0 -; X86-NEXT: movdqa %xmm0, 32(%eax) +; X86-NEXT: paddd 16(%ecx), %xmm2 ; X86-NEXT: movdqa %xmm2, 16(%eax) +; X86-NEXT: movdqa %xmm0, 32(%eax) ; X86-NEXT: movdqa %xmm1, (%eax) ; X86-NEXT: retl $4 ; @@ -130,10 +130,10 @@ define void @add12i32(%i32vec12* sret %ret, %i32vec12* %ap, %i32vec12* %bp) { ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: movdqa 32(%rsi), %xmm2 ; X64-NEXT: paddd (%rdx), %xmm0 -; X64-NEXT: paddd 16(%rdx), %xmm1 ; X64-NEXT: paddd 32(%rdx), %xmm2 -; X64-NEXT: movdqa %xmm2, 32(%rdi) +; X64-NEXT: paddd 16(%rdx), %xmm1 ; X64-NEXT: movdqa %xmm1, 16(%rdi) +; X64-NEXT: movdqa %xmm2, 32(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i32vec12, %i32vec12* %ap, align 16 @@ -225,8 +225,8 @@ define void @add12i16(%i16vec12* nocapture sret %ret, %i16vec12* %ap, %i16vec12* ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddw (%ecx), %xmm0 ; X86-NEXT: paddw 16(%ecx), %xmm1 -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; 
X86-NEXT: movd %xmm1, 16(%eax) +; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -258,10 +258,10 @@ define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* ; X86-NEXT: movdqa (%edx), %xmm1 ; X86-NEXT: movdqa 16(%edx), %xmm2 ; X86-NEXT: paddw (%ecx), %xmm1 -; X86-NEXT: paddw 16(%ecx), %xmm2 ; X86-NEXT: paddw 32(%ecx), %xmm0 -; X86-NEXT: movd %xmm0, 32(%eax) +; X86-NEXT: paddw 16(%ecx), %xmm2 ; X86-NEXT: movdqa %xmm2, 16(%eax) +; X86-NEXT: movd %xmm0, 32(%eax) ; X86-NEXT: movdqa %xmm1, (%eax) ; X86-NEXT: retl $4 ; @@ -272,10 +272,10 @@ define void @add18i16(%i16vec18* nocapture sret %ret, %i16vec18* %ap, %i16vec18* ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: movdqa 32(%rsi), %xmm2 ; X64-NEXT: paddw (%rdx), %xmm0 -; X64-NEXT: paddw 16(%rdx), %xmm1 ; X64-NEXT: paddw 32(%rdx), %xmm2 -; X64-NEXT: movd %xmm2, 32(%rdi) +; X64-NEXT: paddw 16(%rdx), %xmm1 ; X64-NEXT: movdqa %xmm1, 16(%rdi) +; X64-NEXT: movd %xmm2, 32(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i16vec18, %i16vec18* %ap, align 16 @@ -331,11 +331,11 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp ; X86-NEXT: movdqa 16(%edx), %xmm1 ; X86-NEXT: paddb (%ecx), %xmm0 ; X86-NEXT: paddb 16(%ecx), %xmm1 -; X86-NEXT: pextrb $14, %xmm1, 30(%eax) -; X86-NEXT: pextrw $6, %xmm1, 28(%eax) -; X86-NEXT: pextrd $2, %xmm1, 24(%eax) -; X86-NEXT: pextrd $1, %xmm1, 20(%eax) ; X86-NEXT: movd %xmm1, 16(%eax) +; X86-NEXT: pextrd $1, %xmm1, 20(%eax) +; X86-NEXT: pextrd $2, %xmm1, 24(%eax) +; X86-NEXT: pextrw $6, %xmm1, 28(%eax) +; X86-NEXT: pextrb $14, %xmm1, 30(%eax) ; X86-NEXT: movdqa %xmm0, (%eax) ; X86-NEXT: retl $4 ; @@ -346,10 +346,10 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp ; X64-NEXT: movdqa 16(%rsi), %xmm1 ; X64-NEXT: paddb (%rdx), %xmm0 ; X64-NEXT: paddb 16(%rdx), %xmm1 -; X64-NEXT: pextrb $14, %xmm1, 30(%rdi) -; X64-NEXT: pextrw $6, %xmm1, 28(%rdi) -; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) ; X64-NEXT: movq %xmm1, 16(%rdi) +; X64-NEXT: pextrd $2, %xmm1, 24(%rdi) +; X64-NEXT: pextrw $6, %xmm1, 28(%rdi) +; X64-NEXT: pextrb $14, %xmm1, 30(%rdi) ; X64-NEXT: movdqa %xmm0, (%rdi) ; X64-NEXT: retq %a = load %i8vec31, %i8vec31* %ap, align 16 diff --git a/llvm/test/CodeGen/X86/win64_frame.ll b/llvm/test/CodeGen/X86/win64_frame.ll index f95b042..eae02da 100644 --- a/llvm/test/CodeGen/X86/win64_frame.ll +++ b/llvm/test/CodeGen/X86/win64_frame.ll @@ -29,9 +29,9 @@ define void @f2(i32 %p, ...) "no-frame-pointer-elim"="true" { ; ALL-NEXT: movq %rsp, %rbp ; ALL-NEXT: .seh_setframe 5, 0 ; ALL-NEXT: .seh_endprologue -; ALL-NEXT: movq %r9, 48(%rbp) -; ALL-NEXT: movq %r8, 40(%rbp) ; ALL-NEXT: movq %rdx, 32(%rbp) +; ALL-NEXT: movq %r8, 40(%rbp) +; ALL-NEXT: movq %r9, 48(%rbp) ; ALL-NEXT: leaq 32(%rbp), %rax ; ALL-NEXT: movq %rax, (%rbp) ; ALL-NEXT: addq $8, %rsp diff --git a/llvm/test/CodeGen/X86/win64_vararg.ll b/llvm/test/CodeGen/X86/win64_vararg.ll index f0aff6f..91841ce 100644 --- a/llvm/test/CodeGen/X86/win64_vararg.ll +++ b/llvm/test/CodeGen/X86/win64_vararg.ll @@ -6,9 +6,9 @@ define void @average_va(i32 %count, ...) 
nounwind { entry: ; CHECK: pushq -; CHECK: movq %r9, 40(%rsp) -; CHECK: movq %r8, 32(%rsp) -; CHECK: movq %rdx, 24(%rsp) +; CHECK-DAG: movq %r9, 40(%rsp) +; CHECK-DAG: movq %r8, 32(%rsp) +; CHECK-DAG: movq %rdx, 24(%rsp) ; CHECK: leaq 24(%rsp), %rax %ap = alloca i8*, align 8 ; [#uses=1] diff --git a/llvm/test/CodeGen/X86/x86-64-ms_abi-vararg.ll b/llvm/test/CodeGen/X86/x86-64-ms_abi-vararg.ll index e3387a2..016f18c 100644 --- a/llvm/test/CodeGen/X86/x86-64-ms_abi-vararg.ll +++ b/llvm/test/CodeGen/X86/x86-64-ms_abi-vararg.ll @@ -6,9 +6,9 @@ define win64cc void @average_va(i32 %count, ...) nounwind { entry: ; CHECK: pushq -; CHECK: movq %r9, 40(%rsp) -; CHECK: movq %r8, 32(%rsp) -; CHECK: movq %rdx, 24(%rsp) +; CHECK-DAG: movq %r9, 40(%rsp) +; CHECK-DAG: movq %r8, 32(%rsp) +; CHECK-DAG: movq %rdx, 24(%rsp) ; CHECK: leaq 24(%rsp), %rax %ap = alloca i8*, align 8 ; [#uses=1] @@ -59,8 +59,8 @@ entry: ; CHECK-LABEL: copy1: ; CHECK: leaq 32(%rsp), [[REG_copy1:%[a-z]+]] -; CHECK: movq [[REG_copy1]], 8(%rsp) -; CHECK: movq [[REG_copy1]], (%rsp) +; CHECK-DAG: movq [[REG_copy1]], 8(%rsp) +; CHECK-DAG: movq [[REG_copy1]], (%rsp) ; CHECK: ret define win64cc void @copy1(i64 %a0, ...) nounwind { entry: diff --git a/llvm/test/CodeGen/XCore/byVal.ll b/llvm/test/CodeGen/XCore/byVal.ll index 2c2a6e2..fde63f4 100644 --- a/llvm/test/CodeGen/XCore/byVal.ll +++ b/llvm/test/CodeGen/XCore/byVal.ll @@ -39,8 +39,8 @@ entry: ; CHECK: extsp 4 ; CHECK: stw lr, sp[1] ; CHECK: mov r11, r1 -; CHECK: stw r2, sp[3] -; CHECK: stw r3, sp[4] +; CHECK-DAG: stw r2, sp[3] +; CHECK-DAG: stw r3, sp[4] ; CHECK: ldw r0, r0[0] ; CHECK: stw r0, sp[2] ; CHECK: ldaw r1, sp[2] -- 2.7.4