From a372a0f29560f6946212f41c82205892f99cd3a3 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 1 Apr 2016 20:55:19 +0000 Subject: [PATCH] [X86][SSE] Regenerated the vec_extract tests. llvm-svn: 265183 --- llvm/test/CodeGen/X86/vec_ext_inreg.ll | 74 ++++++++++++- llvm/test/CodeGen/X86/vec_extract-avx.ll | 177 +++++++++++++++++++++--------- llvm/test/CodeGen/X86/vec_extract-mmx.ll | 104 ++++++++++++++---- llvm/test/CodeGen/X86/vec_extract-sse4.ll | 114 +++++++++++-------- llvm/test/CodeGen/X86/vec_extract.ll | 142 ++++++++++++++---------- 5 files changed, 431 insertions(+), 180 deletions(-) diff --git a/llvm/test/CodeGen/X86/vec_ext_inreg.ll b/llvm/test/CodeGen/X86/vec_ext_inreg.ll index 02b16a7..1ee4b24 100644 --- a/llvm/test/CodeGen/X86/vec_ext_inreg.ll +++ b/llvm/test/CodeGen/X86/vec_ext_inreg.ll @@ -1,36 +1,108 @@ -; RUN: llc < %s -march=x86-64 +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 define <8 x i32> @a(<8 x i32> %a) nounwind { +; SSE-LABEL: a: +; SSE: # BB#0: +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: pslld $16, %xmm1 +; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: a: +; AVX1: # BB#0: +; AVX1-NEXT: vpslld $16, %xmm0, %xmm1 +; AVX1-NEXT: vpsrad $16, %xmm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; AVX1-NEXT: vpslld $16, %xmm0, %xmm0 +; AVX1-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: a: +; AVX2: # BB#0: +; AVX2-NEXT: vpslld $16, %ymm0, %ymm0 +; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0 +; AVX2-NEXT: retq %b = trunc <8 x 
i32> %a to <8 x i16> %c = sext <8 x i16> %b to <8 x i32> ret <8 x i32> %c } define <3 x i32> @b(<3 x i32> %a) nounwind { +; SSE-LABEL: b: +; SSE: # BB#0: +; SSE-NEXT: pslld $16, %xmm0 +; SSE-NEXT: psrad $16, %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: b: +; AVX: # BB#0: +; AVX-NEXT: vpslld $16, %xmm0, %xmm0 +; AVX-NEXT: vpsrad $16, %xmm0, %xmm0 +; AVX-NEXT: retq %b = trunc <3 x i32> %a to <3 x i16> %c = sext <3 x i16> %b to <3 x i32> ret <3 x i32> %c } define <1 x i32> @c(<1 x i32> %a) nounwind { +; ALL-LABEL: c: +; ALL: # BB#0: +; ALL-NEXT: movswl %di, %eax +; ALL-NEXT: retq %b = trunc <1 x i32> %a to <1 x i16> %c = sext <1 x i16> %b to <1 x i32> ret <1 x i32> %c } define <8 x i32> @d(<8 x i32> %a) nounwind { +; SSE-LABEL: d: +; SSE: # BB#0: +; SSE-NEXT: movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0] +; SSE-NEXT: andps %xmm2, %xmm0 +; SSE-NEXT: andps %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: d: +; AVX1: # BB#0: +; AVX1-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: d: +; AVX2: # BB#0: +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; AVX2-NEXT: retq %b = trunc <8 x i32> %a to <8 x i16> %c = zext <8 x i16> %b to <8 x i32> ret <8 x i32> %c } define <3 x i32> @e(<3 x i32> %a) nounwind { +; SSE-LABEL: e: +; SSE: # BB#0: +; SSE-NEXT: andps {{.*}}(%rip), %xmm0 +; SSE-NEXT: retq +; +; AVX-LABEL: e: +; AVX: # BB#0: +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6,7] +; AVX-NEXT: retq %b = trunc <3 x i32> %a to <3 x i16> %c = zext <3 x i16> %b to <3 x i32> ret <3 x i32> %c } define <1 x i32> @f(<1 x i32> %a) nounwind { +; ALL-LABEL: f: +; ALL: # BB#0: +; ALL-NEXT: movzwl %di, %eax +; ALL-NEXT: retq %b = trunc <1 x i32> %a to <1 x i16> %c = zext <1 x i16> %b to <1 x i32> ret <1 x i32> 
%c diff --git a/llvm/test/CodeGen/X86/vec_extract-avx.ll b/llvm/test/CodeGen/X86/vec_extract-avx.ll index abb0723..7286b4c 100644 --- a/llvm/test/CodeGen/X86/vec_extract-avx.ll +++ b/llvm/test/CodeGen/X86/vec_extract-avx.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=CHECK --check-prefix=X64 ; When extracting multiple consecutive elements from a larger ; vector into a smaller one, do it efficiently. We should use @@ -8,11 +9,18 @@ ; Extracting the low elements only requires using the right kind of store. define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) { -; CHECK-LABEL: low_v8f32_to_v4f32: -; CHECK: # BB#0: -; CHECK-NEXT: vmovaps %xmm0, (%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: low_v8f32_to_v4f32: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vmovaps %xmm0, (%eax) +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: low_v8f32_to_v4f32: +; X64: # BB#0: +; X64-NEXT: vmovaps %xmm0, (%rdi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq %ext0 = extractelement <8 x float> %v, i32 0 %ext1 = extractelement <8 x float> %v, i32 1 %ext2 = extractelement <8 x float> %v, i32 2 @@ -27,11 +35,18 @@ define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) { ; Extracting the high elements requires just one AVX instruction. 
define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) { -; CHECK-LABEL: high_v8f32_to_v4f32: -; CHECK: # BB#0: -; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: high_v8f32_to_v4f32: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vextractf128 $1, %ymm0, (%eax) +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: high_v8f32_to_v4f32: +; X64: # BB#0: +; X64-NEXT: vextractf128 $1, %ymm0, (%rdi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq %ext0 = extractelement <8 x float> %v, i32 4 %ext1 = extractelement <8 x float> %v, i32 5 %ext2 = extractelement <8 x float> %v, i32 6 @@ -48,11 +63,18 @@ define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) { ; if we were actually using the vector in this function and ; have AVX2, we should generate vextracti128 (the int version). define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) { -; CHECK-LABEL: high_v8i32_to_v4i32: -; CHECK: # BB#0: -; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: high_v8i32_to_v4i32: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vextractf128 $1, %ymm0, (%eax) +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: high_v8i32_to_v4i32: +; X64: # BB#0: +; X64-NEXT: vextractf128 $1, %ymm0, (%rdi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq %ext0 = extractelement <8 x i32> %v, i32 4 %ext1 = extractelement <8 x i32> %v, i32 5 %ext2 = extractelement <8 x i32> %v, i32 6 @@ -67,11 +89,18 @@ define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) { ; Make sure that element size doesn't alter the codegen. 
define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) { -; CHECK-LABEL: high_v4f64_to_v2f64: -; CHECK: # BB#0: -; CHECK-NEXT: vextractf128 $1, %ymm0, (%rdi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: high_v4f64_to_v2f64: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: vextractf128 $1, %ymm0, (%eax) +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: high_v4f64_to_v2f64: +; X64: # BB#0: +; X64-NEXT: vextractf128 $1, %ymm0, (%rdi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq %ext0 = extractelement <4 x double> %v, i32 2 %ext1 = extractelement <4 x double> %v, i32 3 %ins0 = insertelement <2 x double> undef, double %ext0, i32 0 @@ -84,14 +113,25 @@ define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) { ; FIXME - Ideally these should just call VMOVD/VMOVQ/VMOVSS/VMOVSD define void @legal_vzmovl_2i32_8i32(<2 x i32>* %in, <8 x i32>* %out) { -; CHECK-LABEL: legal_vzmovl_2i32_8i32: -; CHECK: # BB#0: -; CHECK-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero -; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; CHECK-NEXT: vmovaps %ymm0, (%rsi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: legal_vzmovl_2i32_8i32: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; X32-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X32-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; X32-NEXT: vmovaps %ymm0, (%eax) +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: legal_vzmovl_2i32_8i32: +; X64: # BB#0: +; X64-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; X64-NEXT: vmovaps %ymm0, (%rsi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq %ld = load <2 x i32>, <2 x i32>* %in, align 8 %ext = extractelement <2 x 
i32> %ld, i64 0 %ins = insertelement <8 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %ext, i64 0 @@ -100,14 +140,25 @@ define void @legal_vzmovl_2i32_8i32(<2 x i32>* %in, <8 x i32>* %out) { } define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) { -; CHECK-LABEL: legal_vzmovl_2i64_4i64: -; CHECK: # BB#0: -; CHECK-NEXT: vmovupd (%rdi), %xmm0 -; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] -; CHECK-NEXT: vmovapd %ymm0, (%rsi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: legal_vzmovl_2i64_4i64: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: vmovupd (%ecx), %xmm0 +; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; X32-NEXT: vmovapd %ymm0, (%eax) +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: legal_vzmovl_2i64_4i64: +; X64: # BB#0: +; X64-NEXT: vmovupd (%rdi), %xmm0 +; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; X64-NEXT: vmovapd %ymm0, (%rsi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq %ld = load <2 x i64>, <2 x i64>* %in, align 8 %ext = extractelement <2 x i64> %ld, i64 0 %ins = insertelement <4 x i64> <i64 undef, i64 0, i64 0, i64 0>, i64 %ext, i64 0 @@ -116,14 +167,23 @@ define void @legal_vzmovl_2i64_4i64(<2 x i64>* %in, <4 x i64>* %out) { } define void @legal_vzmovl_2f32_8f32(<2 x float>* %in, <8 x float>* %out) { -; CHECK-LABEL: legal_vzmovl_2f32_8f32: -; CHECK: # BB#0: -; CHECK-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: vxorps %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] -; CHECK-NEXT: vmovaps %ymm0, (%rsi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: legal_vzmovl_2f32_8f32: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: vmovaps %ymm0, (%eax) +; X32-NEXT: vzeroupper +; X32-NEXT: 
retl +; +; X64-LABEL: legal_vzmovl_2f32_8f32: +; X64: # BB#0: +; X64-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] +; X64-NEXT: vmovaps %ymm0, (%rsi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq %ld = load <2 x float>, <2 x float>* %in, align 8 %ext = extractelement <2 x float> %ld, i64 0 %ins = insertelement <8 x float> <float undef, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0, float 0.0>, float %ext, i64 0 @@ -132,14 +192,25 @@ define void @legal_vzmovl_2f32_8f32(<2 x float>* %in, <8 x float>* %out) { } define void @legal_vzmovl_2f64_4f64(<2 x double>* %in, <4 x double>* %out) { -; CHECK-LABEL: legal_vzmovl_2f64_4f64: -; CHECK: # BB#0: -; CHECK-NEXT: vmovupd (%rdi), %xmm0 -; CHECK-NEXT: vxorpd %ymm1, %ymm1, %ymm1 -; CHECK-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] -; CHECK-NEXT: vmovapd %ymm0, (%rsi) -; CHECK-NEXT: vzeroupper -; CHECK-NEXT: retq +; X32-LABEL: legal_vzmovl_2f64_4f64: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: vmovupd (%ecx), %xmm0 +; X32-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; X32-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; X32-NEXT: vmovapd %ymm0, (%eax) +; X32-NEXT: vzeroupper +; X32-NEXT: retl +; +; X64-LABEL: legal_vzmovl_2f64_4f64: +; X64: # BB#0: +; X64-NEXT: vmovupd (%rdi), %xmm0 +; X64-NEXT: vxorpd %ymm1, %ymm1, %ymm1 +; X64-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3] +; X64-NEXT: vmovapd %ymm0, (%rsi) +; X64-NEXT: vzeroupper +; X64-NEXT: retq %ld = load <2 x double>, <2 x double>* %in, align 8 %ext = extractelement <2 x double> %ld, i64 0 %ins = insertelement <4 x double> <double undef, double 0.0, double 0.0, double 0.0>, double %ext, i64 0 diff --git a/llvm/test/CodeGen/X86/vec_extract-mmx.ll b/llvm/test/CodeGen/X86/vec_extract-mmx.ll index 780066d..6d64a9e 100644 --- a/llvm/test/CodeGen/X86/vec_extract-mmx.ll +++ b/llvm/test/CodeGen/X86/vec_extract-mmx.ll @@ -1,12 +1,35 @@ -; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64 -define i32 @test0(<1 x i64>* %v4) { -; CHECK-LABEL: test0: -; CHECK: # BB#0:{{.*}} %entry -; CHECK: pshufw $238, (%[[REG:[a-z]+]]), %mm0 -; CHECK-NEXT: movd %mm0, %eax -; CHECK-NEXT: addl $32, %eax -; CHECK-NEXT: retq +define i32 @test0(<1 x i64>* %v4) nounwind { +; X32-LABEL: test0: +; X32: # BB#0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: andl $-8, %esp +; X32-NEXT: subl $24, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: movl (%eax), %ecx +; X32-NEXT: movl 4(%eax), %eax +; X32-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X32-NEXT: movl %ecx, (%esp) +; X32-NEXT: pshufw $238, (%esp), %mm0 # mm0 = mem[2,3,2,3] +; X32-NEXT: movq %mm0, {{[0-9]+}}(%esp) +; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X32-NEXT: movd %xmm0, %eax +; X32-NEXT: addl $32, %eax +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: retl +; +; X64-LABEL: test0: +; X64: # BB#0: # %entry +; X64-NEXT: pshufw $238, (%rdi), %mm0 # mm0 = mem[2,3,2,3] +; X64-NEXT: movd %mm0, %eax +; X64-NEXT: addl $32, %eax +; X64-NEXT: retq entry: %v5 = load <1 x i64>, <1 x i64>* %v4, align 8 %v12 = bitcast <1 x i64> %v5 to <4 x i16> @@ -21,14 +44,32 @@ entry: ret i32 %v20 } -define i32 @test1(i32* nocapture readonly %ptr) { -; CHECK-LABEL: test1: -; CHECK: # BB#0:{{.*}} %entry -; CHECK: movd (%[[REG]]), %mm0 -; CHECK-NEXT: pshufw $232, %mm0, %mm0 -; CHECK-NEXT: movd %mm0, %eax -; CHECK-NEXT: emms -; CHECK-NEXT: retq +define i32 @test1(i32* nocapture readonly %ptr) nounwind { +; X32-LABEL: test1: +; X32: # BB#0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: andl $-8, %esp +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: movd (%eax), %mm0 +; 
X32-NEXT: pshufw $232, %mm0, %mm0 # mm0 = mm0[0,2,2,3] +; X32-NEXT: movq %mm0, (%esp) +; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X32-NEXT: movd %xmm0, %eax +; X32-NEXT: emms +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: retl +; +; X64-LABEL: test1: +; X64: # BB#0: # %entry +; X64-NEXT: movd (%rdi), %mm0 +; X64-NEXT: pshufw $232, %mm0, %mm0 # mm0 = mm0[0,2,2,3] +; X64-NEXT: movd %mm0, %eax +; X64-NEXT: emms +; X64-NEXT: retq entry: %0 = load i32, i32* %ptr, align 4 %1 = insertelement <2 x i32> undef, i32 %0, i32 0 @@ -47,13 +88,30 @@ entry: ret i32 %12 } -define i32 @test2(i32* nocapture readonly %ptr) { -; CHECK-LABEL: test2: -; CHECK: # BB#0:{{.*}} %entry -; CHECK: pshufw $232, (%[[REG]]), %mm0 -; CHECK-NEXT: movd %mm0, %eax -; CHECK-NEXT: emms -; CHECK-NEXT: retq +define i32 @test2(i32* nocapture readonly %ptr) nounwind { +; X32-LABEL: test2: +; X32: # BB#0: # %entry +; X32-NEXT: pushl %ebp +; X32-NEXT: movl %esp, %ebp +; X32-NEXT: andl $-8, %esp +; X32-NEXT: subl $16, %esp +; X32-NEXT: movl 8(%ebp), %eax +; X32-NEXT: pshufw $232, (%eax), %mm0 # mm0 = mem[0,2,2,3] +; X32-NEXT: movq %mm0, (%esp) +; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X32-NEXT: movd %xmm0, %eax +; X32-NEXT: emms +; X32-NEXT: movl %ebp, %esp +; X32-NEXT: popl %ebp +; X32-NEXT: retl +; +; X64-LABEL: test2: +; X64: # BB#0: # %entry +; X64-NEXT: pshufw $232, (%rdi), %mm0 # mm0 = mem[0,2,2,3] +; X64-NEXT: movd %mm0, %eax +; X64-NEXT: emms +; X64-NEXT: retq entry: %0 = bitcast i32* %ptr to x86_mmx* %1 = load x86_mmx, x86_mmx* %0, align 8 diff --git a/llvm/test/CodeGen/X86/vec_extract-sse4.ll b/llvm/test/CodeGen/X86/vec_extract-sse4.ll index cea0d64..f073f15 100644 --- a/llvm/test/CodeGen/X86/vec_extract-sse4.ll +++ b/llvm/test/CodeGen/X86/vec_extract-sse4.ll @@ -1,59 +1,79 @@ -; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse4.1 | FileCheck %s +; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64 define void @t1(float* %R, <4 x float>* %P1) nounwind { -; CHECK-LABEL: t1: -; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movss 12(%ecx), %xmm0 -; CHECK-NEXT: movss %xmm0, (%eax) -; CHECK-NEXT: retl - - %X = load <4 x float>, <4 x float>* %P1 - %tmp = extractelement <4 x float> %X, i32 3 - store float %tmp, float* %R - ret void +; X32-LABEL: t1: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: movss %xmm0, (%eax) +; X32-NEXT: retl +; +; X64-LABEL: t1: +; X64: # BB#0: +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movss %xmm0, (%rdi) +; X64-NEXT: retq + %X = load <4 x float>, <4 x float>* %P1 + %tmp = extractelement <4 x float> %X, i32 3 + store float %tmp, float* %R + ret void } define float @t2(<4 x float>* %P1) nounwind { -; CHECK-LABEL: t2: -; CHECK: # BB#0: -; CHECK-NEXT: pushl %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] -; CHECK-NEXT: movss %xmm0, (%esp) -; CHECK-NEXT: flds (%esp) -; CHECK-NEXT: popl %eax -; CHECK-NEXT: retl - - %X = load <4 x float>, <4 x float>* %P1 - %tmp = extractelement <4 x float> %X, i32 2 - ret float %tmp +; X32-LABEL: t2: +; X32: # BB#0: +; X32-NEXT: pushl %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] +; X32-NEXT: movss %xmm0, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: popl %eax +; X32-NEXT: retl +; +; X64-LABEL: t2: +; X64: # BB#0: +; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] +; X64-NEXT: retq + %X = load <4 x float>, <4 x float>* %P1 + %tmp = extractelement <4 x float> %X, i32 2 + ret float %tmp 
} define void @t3(i32* %R, <4 x i32>* %P1) nounwind { -; CHECK-LABEL: t3: -; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl 12(%ecx), %ecx -; CHECK-NEXT: movl %ecx, (%eax) -; CHECK-NEXT: retl - - %X = load <4 x i32>, <4 x i32>* %P1 - %tmp = extractelement <4 x i32> %X, i32 3 - store i32 %tmp, i32* %R - ret void +; X32-LABEL: t3: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movl 12(%ecx), %ecx +; X32-NEXT: movl %ecx, (%eax) +; X32-NEXT: retl +; +; X64-LABEL: t3: +; X64: # BB#0: +; X64-NEXT: movl 12(%rsi), %eax +; X64-NEXT: movl %eax, (%rdi) +; X64-NEXT: retq + %X = load <4 x i32>, <4 x i32>* %P1 + %tmp = extractelement <4 x i32> %X, i32 3 + store i32 %tmp, i32* %R + ret void } define i32 @t4(<4 x i32>* %P1) nounwind { -; CHECK-LABEL: t4: -; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl 12(%eax), %eax -; CHECK-NEXT: retl - - %X = load <4 x i32>, <4 x i32>* %P1 - %tmp = extractelement <4 x i32> %X, i32 3 - ret i32 %tmp +; X32-LABEL: t4: +; X32: # BB#0: +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl 12(%eax), %eax +; X32-NEXT: retl +; +; X64-LABEL: t4: +; X64: # BB#0: +; X64-NEXT: movl 12(%rdi), %eax +; X64-NEXT: retq + %X = load <4 x i32>, <4 x i32>* %P1 + %tmp = extractelement <4 x i32> %X, i32 3 + ret i32 %tmp } diff --git a/llvm/test/CodeGen/X86/vec_extract.ll b/llvm/test/CodeGen/X86/vec_extract.ll index 3b47888..47f719d 100644 --- a/llvm/test/CodeGen/X86/vec_extract.ll +++ b/llvm/test/CodeGen/X86/vec_extract.ll @@ -1,74 +1,104 @@ -; RUN: llc < %s -mcpu=corei7 -march=x86 -mattr=+sse2,-sse4.1 | FileCheck %s - -target triple = "x86_64-unknown-linux-gnu" +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu 
-mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64 define void @test1(<4 x float>* %F, float* %f) nounwind { -; CHECK-LABEL: test1: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movaps (%ecx), %xmm0 -; CHECK-NEXT: addps %xmm0, %xmm0 -; CHECK-NEXT: movss %xmm0, (%eax) -; CHECK-NEXT: retl +; X32-LABEL: test1: +; X32: # BB#0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movaps (%ecx), %xmm0 +; X32-NEXT: addps %xmm0, %xmm0 +; X32-NEXT: movss %xmm0, (%eax) +; X32-NEXT: retl +; +; X64-LABEL: test1: +; X64: # BB#0: # %entry +; X64-NEXT: movaps (%rdi), %xmm0 +; X64-NEXT: addps %xmm0, %xmm0 +; X64-NEXT: movss %xmm0, (%rsi) +; X64-NEXT: retq entry: - %tmp = load <4 x float>, <4 x float>* %F ; <<4 x float>> [#uses=2] - %tmp7 = fadd <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1] - %tmp2 = extractelement <4 x float> %tmp7, i32 0 ; <float> [#uses=1] - store float %tmp2, float* %f - ret void + %tmp = load <4 x float>, <4 x float>* %F + %tmp7 = fadd <4 x float> %tmp, %tmp + %tmp2 = extractelement <4 x float> %tmp7, i32 0 + store float %tmp2, float* %f + ret void } define float @test2(<4 x float>* %F, float* %f) nounwind { -; CHECK-LABEL: test2: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: pushl %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movaps (%eax), %xmm0 -; CHECK-NEXT: addps %xmm0, %xmm0 -; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: movss %xmm0, (%esp) -; CHECK-NEXT: flds (%esp) -; CHECK-NEXT: popl %eax -; CHECK-NEXT: retl +; X32-LABEL: test2: +; X32: # BB#0: # %entry +; X32-NEXT: pushl %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movaps (%eax), %xmm0 +; X32-NEXT: addps %xmm0, %xmm0 +; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; X32-NEXT: movss %xmm0, (%esp) +; X32-NEXT: flds (%esp) +; X32-NEXT: popl %eax +; X32-NEXT: retl +; +; X64-LABEL: test2: +; X64: # BB#0: # %entry +; X64-NEXT: movaps 
(%rdi), %xmm0 +; X64-NEXT: addps %xmm0, %xmm0 +; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; X64-NEXT: retq entry: - %tmp = load <4 x float>, <4 x float>* %F ; <<4 x float>> [#uses=2] - %tmp7 = fadd <4 x float> %tmp, %tmp ; <<4 x float>> [#uses=1] - %tmp2 = extractelement <4 x float> %tmp7, i32 2 ; <float> [#uses=1] - ret float %tmp2 + %tmp = load <4 x float>, <4 x float>* %F + %tmp7 = fadd <4 x float> %tmp, %tmp + %tmp2 = extractelement <4 x float> %tmp7, i32 2 + ret float %tmp2 } define void @test3(float* %R, <4 x float>* %P1) nounwind { -; CHECK-LABEL: test3: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movss 12(%ecx), %xmm0 -; CHECK-NEXT: movss %xmm0, (%eax) -; CHECK-NEXT: retl +; X32-LABEL: test3: +; X32: # BB#0: # %entry +; X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-NEXT: movss %xmm0, (%eax) +; X32-NEXT: retl +; +; X64-LABEL: test3: +; X64: # BB#0: # %entry +; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movss %xmm0, (%rdi) +; X64-NEXT: retq entry: - %X = load <4 x float>, <4 x float>* %P1 ; <<4 x float>> [#uses=1] - %tmp = extractelement <4 x float> %X, i32 3 ; <float> [#uses=1] - store float %tmp, float* %R - ret void + %X = load <4 x float>, <4 x float>* %P1 + %tmp = extractelement <4 x float> %X, i32 3 + store float %tmp, float* %R + ret void } define double @test4(double %A) nounwind { -; CHECK-LABEL: test4: -; CHECK: # BB#0: # %entry -; CHECK-NEXT: subl $12, %esp -; CHECK-NEXT: calll foo -; CHECK-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: addsd {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: movsd %xmm0, (%esp) -; CHECK-NEXT: fldl (%esp) -; CHECK-NEXT: addl $12, %esp -; CHECK-NEXT: retl +; X32-LABEL: test4: +; X32: # BB#0: # %entry +; X32-NEXT: subl $12, %esp +; X32-NEXT: calll foo +; X32-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; X32-NEXT: addsd {{[0-9]+}}(%esp), 
%xmm0 +; X32-NEXT: movsd %xmm0, (%esp) +; X32-NEXT: fldl (%esp) +; X32-NEXT: addl $12, %esp +; X32-NEXT: retl +; +; X64-LABEL: test4: +; X64: # BB#0: # %entry +; X64-NEXT: pushq %rax +; X64-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; X64-NEXT: callq foo +; X64-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1,0] +; X64-NEXT: addsd (%rsp), %xmm0 # 8-byte Folded Reload +; X64-NEXT: popq %rax +; X64-NEXT: retq entry: - %tmp1 = call <2 x double> @foo( ) ; <<2 x double>> [#uses=1] - %tmp2 = extractelement <2 x double> %tmp1, i32 1 ; <double> [#uses=1] - %tmp3 = fadd double %tmp2, %A ; <double> [#uses=1] - ret double %tmp3 + %tmp1 = call <2 x double> @foo( ) + %tmp2 = extractelement <2 x double> %tmp1, i32 1 + %tmp3 = fadd double %tmp2, %A + ret double %tmp3 } declare <2 x double> @foo() -- 2.7.4