From: Simon Pilgrim
Date: Sat, 6 Oct 2018 20:24:27 +0000 (+0000)
Subject: [X86][SSE] Add SSE41 vector int2fp tests
X-Git-Tag: llvmorg-8.0.0-rc1~7099
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=0dcf1cea038ebfdfc12b3d8dad10a6c0434d6ba5;p=platform%2Fupstream%2Fllvm.git

[X86][SSE] Add SSE41 vector int2fp tests

llvm-svn: 343925
---

diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
index 534485e..2b8ceeb 100644
--- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll
@@ -1,33 +1,45 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=VEX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VLDQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=ALL,SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,VEX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512VLDQ
 ;
 ; 32-bit tests to make sure we're not doing anything stupid.
 ; RUN: llc < %s -mtriple=i686-unknown-unknown
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1
 ;
 ; Signed Integer to Double
 ;

 define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
-; SSE-LABEL: sitofp_2i64_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: cvtsi2sdq %rax, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2sdq %rax, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_2i64_to_2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: cvtsi2sdq %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2sdq %rax, %xmm0
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_2i64_to_2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cvtsi2sdq %rax, %xmm1
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2sdq %rax, %xmm0
+; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: sitofp_2i64_to_2f64:
 ; VEX: # %bb.0:
@@ -103,12 +115,18 @@ define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
 }

 define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
-; SSE-LABEL: sitofp_2i16_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_2i16_to_2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_2i16_to_2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
+; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: sitofp_2i16_to_2f64:
 ; AVX: # %bb.0:
@@ -121,12 +139,18 @@ define <2 x double> @sitofp_2i16_to_2f64(<8 x i16> %a) {
 }

 define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
-; SSE-LABEL: sitofp_8i16_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_8i16_to_2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_8i16_to_2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
+; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: sitofp_8i16_to_2f64:
 ; VEX: # %bb.0:
@@ -147,13 +171,19 @@ define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
 }

 define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
-; SSE-LABEL: sitofp_2i8_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE-NEXT: psrad $24, %xmm0
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_2i8_to_2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_2i8_to_2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: sitofp_2i8_to_2f64:
 ; AVX: # %bb.0:
@@ -166,13 +196,19 @@ define <2 x double> @sitofp_2i8_to_2f64(<16 x i8> %a) {
 }

 define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
-; SSE-LABEL: sitofp_16i8_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE-NEXT: psrad $24, %xmm0
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_16i8_to_2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_16i8_to_2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: sitofp_16i8_to_2f64:
 ; VEX: # %bb.0:
@@ -193,25 +229,42 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
 }

 define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) {
-; SSE-LABEL: sitofp_4i64_to_4f64:
-; SSE: # %bb.0:
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: cvtsi2sdq %rax, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2sdq %rax, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: cvtsi2sdq %rax, %xmm3
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2sdq %rax, %xmm0
-; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: movaps %xmm3, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_4i64_to_4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: cvtsi2sdq %rax, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2sdq %rax, %xmm0
+; SSE2-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: cvtsi2sdq %rax, %xmm3
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2sdq %rax, %xmm0
+; SSE2-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: movaps %xmm3, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_4i64_to_4f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cvtsi2sdq %rax, %xmm2
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2sdq %rax, %xmm0
+; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT: pextrq $1, %xmm1, %rax
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: cvtsi2sdq %rax, %xmm2
+; SSE41-NEXT: movq %xmm1, %rax
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: cvtsi2sdq %rax, %xmm1
+; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: sitofp_4i64_to_4f64:
 ; AVX1: # %bb.0:
@@ -310,14 +363,22 @@ define <4 x double> @sitofp_4i32_to_4f64(<4 x i32> %a) {
 }

 define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
-; SSE-LABEL: sitofp_4i16_to_4f64:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_4i16_to_4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_4i16_to_4f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: sitofp_4i16_to_4f64:
 ; AVX: # %bb.0:
@@ -330,14 +391,22 @@ define <4 x double> @sitofp_4i16_to_4f64(<8 x i16> %a) {
 }

 define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
-; SSE-LABEL: sitofp_8i16_to_4f64:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_8i16_to_4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_8i16_to_4f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: sitofp_8i16_to_4f64:
 ; VEX: # %bb.0:
@@ -357,15 +426,23 @@ define <4 x double> @sitofp_8i16_to_4f64(<8 x i16> %a) {
 }

 define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
-; SSE-LABEL: sitofp_4i8_to_4f64:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: psrad $24, %xmm1
-; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_4i8_to_4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: psrad $24, %xmm1
+; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_4i8_to_4f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: sitofp_4i8_to_4f64:
 ; AVX: # %bb.0:
@@ -378,15 +455,23 @@ define <4 x double> @sitofp_4i8_to_4f64(<16 x i8> %a) {
 }

 define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
-; SSE-LABEL: sitofp_16i8_to_4f64:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: psrad $24, %xmm1
-; SSE-NEXT: cvtdq2pd %xmm1, %xmm0
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_16i8_to_4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: psrad $24, %xmm1
+; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_16i8_to_4f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: sitofp_16i8_to_4f64:
 ; VEX: # %bb.0:
@@ -410,24 +495,36 @@ define <4 x double> @sitofp_16i8_to_4f64(<16 x i8> %a) {
 ;

 define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
-; SSE-LABEL: uitofp_2i64_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE-NEXT: movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
-; SSE-NEXT: subpd %xmm4, %xmm0
-; SSE-NEXT: movapd %xmm0, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
-; SSE-NEXT: addpd %xmm0, %xmm1
-; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; SSE-NEXT: subpd %xmm4, %xmm3
-; SSE-NEXT: movapd %xmm3, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
-; SSE-NEXT: addpd %xmm3, %xmm0
-; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
-; SSE-NEXT: movapd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_2i64_to_2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
+; SSE2-NEXT: subpd %xmm4, %xmm0
+; SSE2-NEXT: movapd %xmm0, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT: addpd %xmm0, %xmm1
+; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE2-NEXT: subpd %xmm4, %xmm3
+; SSE2-NEXT: movapd %xmm3, %xmm0
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
+; SSE2-NEXT: addpd %xmm3, %xmm0
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; SSE2-NEXT: movapd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_2i64_to_2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25]
+; SSE41-NEXT: subpd %xmm3, %xmm0
+; SSE41-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE41-NEXT: subpd %xmm3, %xmm2
+; SSE41-NEXT: haddpd %xmm2, %xmm0
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_2i64_to_2f64:
 ; VEX: # %bb.0:
@@ -476,16 +573,27 @@ define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
 }

 define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
-; SSE-LABEL: uitofp_2i32_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE-NEXT: addpd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_2i32_to_2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
+; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_2i32_to_2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
+; SSE41-NEXT: addpd %xmm1, %xmm0
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_2i32_to_2f64:
 ; VEX: # %bb.0:
@@ -529,16 +637,27 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
 }

 define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
-; SSE-LABEL: uitofp_4i32_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: mulpd {{.*}}(%rip), %xmm0
-; SSE-NEXT: addpd %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_4i32_to_2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,65535,0,65535,0]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0
+; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_4i32_to_2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pxor %xmm1, %xmm1
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0
+; SSE41-NEXT: addpd %xmm1, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: uitofp_4i32_to_2f64:
 ; AVX1: # %bb.0:
@@ -602,12 +721,18 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
 }

 define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
-; SSE-LABEL: uitofp_2i16_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_2i16_to_2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_2i16_to_2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: uitofp_2i16_to_2f64:
 ; AVX: # %bb.0:
@@ -620,12 +745,18 @@ define <2 x double> @uitofp_2i16_to_2f64(<8 x i16> %a) {
 }

 define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
-; SSE-LABEL: uitofp_8i16_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_8i16_to_2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_8i16_to_2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_8i16_to_2f64:
 ; VEX: # %bb.0:
@@ -646,13 +777,19 @@ define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
 }

 define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
-; SSE-LABEL: uitofp_2i8_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_2i8_to_2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_2i8_to_2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: uitofp_2i8_to_2f64:
 ; AVX: # %bb.0:
@@ -665,13 +802,19 @@ define <2 x double> @uitofp_2i8_to_2f64(<16 x i8> %a) {
 }

 define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
-; SSE-LABEL: uitofp_16i8_to_2f64:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_16i8_to_2f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_16i8_to_2f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_16i8_to_2f64:
 ; VEX: # %bb.0:
@@ -692,37 +835,55 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
 }

 define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
-; SSE-LABEL: uitofp_4i64_to_4f64:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; SSE-NEXT: movapd {{.*#+}} xmm5 = [4503599627370496,1.9342813113834067E+25]
-; SSE-NEXT: subpd %xmm5, %xmm2
-; SSE-NEXT: movapd %xmm2, %xmm0
-; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
-; SSE-NEXT: addpd %xmm2, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE-NEXT: subpd %xmm5, %xmm4
-; SSE-NEXT: movapd %xmm4, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
-; SSE-NEXT: addpd %xmm4, %xmm2
-; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
-; SSE-NEXT: subpd %xmm5, %xmm1
-; SSE-NEXT: movapd %xmm1, %xmm2
-; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
-; SSE-NEXT: addpd %xmm1, %xmm2
-; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
-; SSE-NEXT: subpd %xmm5, %xmm4
-; SSE-NEXT: movapd %xmm4, %xmm1
-; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
-; SSE-NEXT: addpd %xmm4, %xmm1
-; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
-; SSE-NEXT: movapd %xmm2, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_4i64_to_4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-NEXT: movapd {{.*#+}} xmm5 = [4503599627370496,1.9342813113834067E+25]
+; SSE2-NEXT: subpd %xmm5, %xmm2
+; SSE2-NEXT: movapd %xmm2, %xmm0
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
+; SSE2-NEXT: addpd %xmm2, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-NEXT: subpd %xmm5, %xmm4
+; SSE2-NEXT: movapd %xmm4, %xmm2
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
+; SSE2-NEXT: addpd %xmm4, %xmm2
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT: subpd %xmm5, %xmm1
+; SSE2-NEXT: movapd %xmm1, %xmm2
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; SSE2-NEXT: addpd %xmm1, %xmm2
+; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; SSE2-NEXT: subpd %xmm5, %xmm4
+; SSE2-NEXT: movapd %xmm4, %xmm1
+; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
+; SSE2-NEXT: addpd %xmm4, %xmm1
+; SSE2-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
+; SSE2-NEXT: movapd %xmm2, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_4i64_to_4f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25]
+; SSE41-NEXT: subpd %xmm4, %xmm0
+; SSE41-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE41-NEXT: subpd %xmm4, %xmm3
+; SSE41-NEXT: haddpd %xmm3, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE41-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE41-NEXT: subpd %xmm4, %xmm1
+; SSE41-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
+; SSE41-NEXT: subpd %xmm4, %xmm3
+; SSE41-NEXT: haddpd %xmm3, %xmm1
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_4i64_to_4f64:
 ; VEX: # %bb.0:
@@ -792,25 +953,45 @@ define <4 x double> @uitofp_4i64_to_4f64(<4 x i64> %a) {
 }

 define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
-; SSE-LABEL: uitofp_4i32_to_4f64:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: cvtdq2pd %xmm1, %xmm1
-; SSE-NEXT: movapd {{.*#+}} xmm2 = [65536,65536]
-; SSE-NEXT: mulpd %xmm2, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE-NEXT: pand {{.*}}(%rip), %xmm0
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm0
-; SSE-NEXT: addpd %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm3, %xmm1
-; SSE-NEXT: psrld $16, %xmm1
-; SSE-NEXT: cvtdq2pd %xmm1, %xmm4
-; SSE-NEXT: mulpd %xmm2, %xmm4
-; SSE-NEXT: pand {{.*}}(%rip), %xmm3
-; SSE-NEXT: cvtdq2pd %xmm3, %xmm1
-; SSE-NEXT: addpd %xmm4, %xmm1
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_4i32_to_4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE2-NEXT: movapd {{.*#+}} xmm2 = [65536,65536]
+; SSE2-NEXT: mulpd %xmm2, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE2-NEXT: addpd %xmm1, %xmm0
+; SSE2-NEXT: movdqa %xmm3, %xmm1
+; SSE2-NEXT: psrld $16, %xmm1
+; SSE2-NEXT: cvtdq2pd %xmm1, %xmm4
+; SSE2-NEXT: mulpd %xmm2, %xmm4
+; SSE2-NEXT: pand {{.*}}(%rip), %xmm3
+; SSE2-NEXT: cvtdq2pd %xmm3, %xmm1
+; SSE2-NEXT: addpd %xmm4, %xmm1
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_4i32_to_4f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psrld $16, %xmm1
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65536,65536]
+; SSE41-NEXT: mulpd %xmm2, %xmm1
+; SSE41-NEXT: pxor %xmm3, %xmm3
+; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0
+; SSE41-NEXT: addpd %xmm1, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3,4,5,6,7]
+; SSE41-NEXT: psrld $16, %xmm4
+; SSE41-NEXT: cvtdq2pd %xmm4, %xmm4
+; SSE41-NEXT: mulpd %xmm2, %xmm4
+; SSE41-NEXT: cvtdq2pd %xmm3, %xmm1
+; SSE41-NEXT: addpd %xmm4, %xmm1
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: uitofp_4i32_to_4f64:
 ; AVX1: # %bb.0:
@@ -863,15 +1044,23 @@ define <4 x double> @uitofp_4i32_to_4f64(<4 x i32> %a) {
 }

 define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
-; SSE-LABEL: uitofp_4i16_to_4f64:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_4i16_to_4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_4i16_to_4f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: uitofp_4i16_to_4f64:
 ; AVX: # %bb.0:
@@ -884,15 +1073,23 @@ define <4 x double> @uitofp_4i16_to_4f64(<8 x i16> %a) {
 }

 define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
-; SSE-LABEL: uitofp_8i16_to_4f64:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_8i16_to_4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_8i16_to_4f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_8i16_to_4f64:
 ; VEX: # %bb.0:
@@ -912,16 +1109,24 @@ define <4 x double> @uitofp_8i16_to_4f64(<8 x i16> %a) {
 }

 define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
-; SSE-LABEL: uitofp_4i8_to_4f64:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_4i8_to_4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_4i8_to_4f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: uitofp_4i8_to_4f64:
 ; AVX: # %bb.0:
@@ -934,16 +1139,24 @@ define <4 x double> @uitofp_4i8_to_4f64(<16 x i8> %a) {
 }

 define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
-; SSE-LABEL: uitofp_16i8_to_4f64:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: cvtdq2pd %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_16i8_to_4f64:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: cvtdq2pd %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_16i8_to_4f64:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_16i8_to_4f64:
 ; VEX: # %bb.0:
@@ -967,17 +1180,27 @@ define <4 x double> @uitofp_16i8_to_4f64(<16 x i8> %a) {
 ;

 define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
-; SSE-LABEL: sitofp_2i64_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_2i64_to_4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_2i64_to_4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: sitofp_2i64_to_4f32:
 ; VEX: # %bb.0:
@@ -1030,18 +1253,29 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
 }

 define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
-; SSE-LABEL: sitofp_2i64_to_4f32_zero:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_2i64_to_4f32_zero:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_2i64_to_4f32_zero:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],zero,zero
+; SSE41-NEXT: movaps %xmm1, %xmm0
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: sitofp_2i64_to_4f32_zero:
 ; VEX: # %bb.0:
@@ -1089,20 +1323,33 @@ define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
 }

 define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
-; SSE-LABEL: sitofp_4i64_to_4f32_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,0]
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_4i64_to_4f32_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_4i64_to_4f32_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: sitofp_4i64_to_4f32_undef:
 ; VEX: # %bb.0:
@@ -1171,12 +1418,18 @@ define <4 x float> @sitofp_4i32_to_4f32(<4 x i32> %a) {
 }

 define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
-; SSE-LABEL: sitofp_4i16_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_4i16_to_4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_4i16_to_4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
+; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: sitofp_4i16_to_4f32:
 ; AVX: # %bb.0:
@@ -1189,12 +1442,18 @@ define <4 x float> @sitofp_4i16_to_4f32(<8 x i16> %a) {
 }

 define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
-; SSE-LABEL: sitofp_8i16_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_8i16_to_4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_8i16_to_4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
+; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: sitofp_8i16_to_4f32:
 ; AVX1: # %bb.0:
@@ -1228,13 +1487,19 @@ define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
 }

 define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
-; SSE-LABEL: sitofp_4i8_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE-NEXT: psrad $24, %xmm0
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_4i8_to_4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_4i8_to_4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: sitofp_4i8_to_4f32:
 ; AVX: # %bb.0:
@@ -1247,13 +1512,19 @@ define <4 x float> @sitofp_4i8_to_4f32(<16 x i8> %a) {
 }

 define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
-; SSE-LABEL: sitofp_16i8_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE-NEXT: psrad $24, %xmm0
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_16i8_to_4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_16i8_to_4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: sitofp_16i8_to_4f32:
 ; AVX1: # %bb.0:
@@ -1287,26 +1558,44 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
 }

 define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
-; SSE-LABEL: sitofp_4i64_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: cvtsi2ssq %rax, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; SSE-NEXT: movaps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_4i64_to_4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: movaps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_4i64_to_4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; SSE41-NEXT: movq %xmm1, %rax
+; SSE41-NEXT: xorps %xmm2, %xmm2
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm2
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; SSE41-NEXT: pextrq $1, %xmm1, %rax
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: sitofp_4i64_to_4f32:
 ; AVX1: # %bb.0:
@@ -1409,16 +1698,26 @@ define <8 x float> @sitofp_8i32_to_8f32(<8 x i32> %a) {
 }

 define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
-; SSE-LABEL: sitofp_8i16_to_8f32:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE-NEXT: psrad $16, %xmm1
-; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; SSE-NEXT: psrad $16, %xmm0
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_8i16_to_8f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT: psrad $16, %xmm1
+; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2
+; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
+; SSE2-NEXT: psrad $16, %xmm0
+; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_8i16_to_8f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm1
+; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT: pmovsxwd %xmm0, %xmm0
+; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: sitofp_8i16_to_8f32:
 ; AVX1: # %bb.0:
@@ -1445,19 +1744,29 @@ define <8 x float> @sitofp_8i16_to_8f32(<8 x i16> %a) {
 }

 define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
-; SSE-LABEL: sitofp_8i8_to_8f32:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; SSE-NEXT: psrad $24, %xmm1
-; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE-NEXT: psrad $24, %xmm0
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_8i8_to_8f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm1
+; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_8i8_to_8f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
+; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: sitofp_8i8_to_8f32:
 ; AVX1: # %bb.0:
@@ -1485,19 +1794,29 @@ define <8 x float> @sitofp_8i8_to_8f32(<16 x i8> %a) {
 }

 define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
-; SSE-LABEL: sitofp_16i8_to_8f32:
-; SSE: # %bb.0:
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
-; SSE-NEXT: psrad $24, %xmm1
-; SSE-NEXT: cvtdq2ps %xmm1, %xmm2
-; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
-; SSE-NEXT: psrad $24, %xmm0
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm1
-; SSE-NEXT: movaps %xmm2, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: sitofp_16i8_to_8f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm1
+; SSE2-NEXT: cvtdq2ps %xmm1, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT: psrad $24, %xmm0
+; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1
+; SSE2-NEXT: movaps %xmm2, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: sitofp_16i8_to_8f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovsxbd %xmm0, %xmm1
+; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
+; SSE41-NEXT: pmovsxbd %xmm0, %xmm0
+; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: sitofp_16i8_to_8f32:
 ; AVX1: # %bb.0:
@@ -1530,44 +1849,79 @@ define <8 x float> @sitofp_16i8_to_8f32(<16 x i8> %a) {
 ;

 define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
-; SSE-LABEL: uitofp_2i64_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: testq %rax, %rax
-; SSE-NEXT: js .LBB39_1
-; SSE-NEXT: # %bb.2:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: jmp .LBB39_3
-; SSE-NEXT: .LBB39_1:
-; SSE-NEXT: movq %rax, %rcx
-; SSE-NEXT: shrq %rcx
-; SSE-NEXT: andl $1, %eax
-; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: addss %xmm0, %xmm0
-; SSE-NEXT: .LBB39_3:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: testq %rax, %rax
-; SSE-NEXT: js .LBB39_4
-; SSE-NEXT: # %bb.5:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: retq
-; SSE-NEXT: .LBB39_4:
-; SSE-NEXT: movq %rax, %rcx
-; SSE-NEXT: shrq %rcx
-; SSE-NEXT: andl $1, %eax
-; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: addss %xmm1, %xmm1
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_2i64_to_4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: testq %rax, %rax
+; SSE2-NEXT: js .LBB39_1
+; SSE2-NEXT: # %bb.2:
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT: jmp .LBB39_3
+; SSE2-NEXT: .LBB39_1:
+; SSE2-NEXT: movq %rax, %rcx
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: andl $1, %eax
+; SSE2-NEXT: orq %rcx, %rax
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT: addss %xmm0, %xmm0
+; SSE2-NEXT: .LBB39_3:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: testq %rax, %rax
+; SSE2-NEXT: js .LBB39_4
+; SSE2-NEXT: # %bb.5:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+; SSE2-NEXT: .LBB39_4:
+; SSE2-NEXT: movq %rax, %rcx
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: andl $1, %eax
+; SSE2-NEXT: orq %rcx, %rax
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT: addss %xmm1, %xmm1
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_2i64_to_4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: testq %rax, %rax
+; SSE41-NEXT: js .LBB39_1
+; SSE41-NEXT: # %bb.2:
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT: jmp .LBB39_3
+; SSE41-NEXT: .LBB39_1:
+; SSE41-NEXT: movq %rax, %rcx
+; SSE41-NEXT: shrq %rcx
+; SSE41-NEXT: andl $1, %eax
+; SSE41-NEXT: orq %rcx, %rax
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT: addss %xmm1, %xmm1
+; SSE41-NEXT: .LBB39_3:
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: testq %rax, %rax
+; SSE41-NEXT: js .LBB39_4
+; SSE41-NEXT: # %bb.5:
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE41-NEXT: retq
+; SSE41-NEXT: .LBB39_4:
+; SSE41-NEXT: movq %rax, %rcx
+; SSE41-NEXT: shrq %rcx
+; SSE41-NEXT: andl $1, %eax
+; SSE41-NEXT: orq %rcx, %rax
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT: addss %xmm0, %xmm0
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_2i64_to_4f32:
 ; VEX: # %bb.0:
@@ -1649,44 +2003,82 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
 }

 define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
-; SSE-LABEL: uitofp_2i64_to_2f32:
-; SSE: # %bb.0:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: testq %rax, %rax
-; SSE-NEXT: js .LBB40_1
-; SSE-NEXT: # %bb.2:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: jmp .LBB40_3
-; SSE-NEXT: .LBB40_1:
-; SSE-NEXT: movq %rax, %rcx
-; SSE-NEXT: shrq %rcx
-; SSE-NEXT: andl $1, %eax
-; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: addss %xmm1, %xmm1
-; SSE-NEXT: .LBB40_3:
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: testq %rax, %rax
-; SSE-NEXT: js .LBB40_4
-; SSE-NEXT: # %bb.5:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: jmp .LBB40_6
-; SSE-NEXT: .LBB40_4:
-; SSE-NEXT: movq %rax, %rcx
-; SSE-NEXT: shrq %rcx
-; SSE-NEXT: andl $1, %eax
-; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: addss %xmm0, %xmm0
-; SSE-NEXT: .LBB40_6:
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_2i64_to_2f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: testq %rax, %rax
+; SSE2-NEXT: js .LBB40_1
+; SSE2-NEXT: # %bb.2:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT: jmp .LBB40_3
+; SSE2-NEXT: .LBB40_1:
+; SSE2-NEXT: movq %rax, %rcx
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: andl $1, %eax
+; SSE2-NEXT: orq %rcx, %rax
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT: addss %xmm1, %xmm1
+; SSE2-NEXT: .LBB40_3:
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: testq %rax, %rax
+; SSE2-NEXT: js .LBB40_4
+; SSE2-NEXT: # %bb.5:
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT: jmp .LBB40_6
+; SSE2-NEXT: .LBB40_4:
+; SSE2-NEXT: movq %rax, %rcx
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: andl $1, %eax
+; SSE2-NEXT: orq %rcx, %rax
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT: addss %xmm0, %xmm0
+; SSE2-NEXT: .LBB40_6:
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_2i64_to_2f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: testq %rax, %rax
+; SSE41-NEXT: js .LBB40_1
+; SSE41-NEXT: # %bb.2:
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT: jmp .LBB40_3
+; SSE41-NEXT: .LBB40_1:
+; SSE41-NEXT: movq %rax, %rcx
+; SSE41-NEXT: shrq %rcx
+; SSE41-NEXT: andl $1, %eax
+; SSE41-NEXT: orq %rcx, %rax
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT: addss %xmm0, %xmm0
+; SSE41-NEXT: .LBB40_3:
+; SSE41-NEXT: pextrq $1, %xmm1, %rax
+; SSE41-NEXT: testq %rax, %rax
+; SSE41-NEXT: js .LBB40_4
+; SSE41-NEXT: # %bb.5:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; SSE41-NEXT: retq
+; SSE41-NEXT: .LBB40_4:
+; SSE41-NEXT: movq %rax, %rcx
+; SSE41-NEXT: shrq %rcx
+; SSE41-NEXT: andl $1, %eax
+; SSE41-NEXT: orq %rcx, %rax
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT: addss %xmm1, %xmm1
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_2i64_to_2f32:
 ; VEX: # %bb.0:
@@ -1758,52 +2150,95 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
 }

 define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
-; SSE-LABEL: uitofp_4i64_to_4f32_undef:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: movq %xmm0, %rax
-; SSE-NEXT: testq %rax, %rax
-; SSE-NEXT: js .LBB41_1
-; SSE-NEXT: # %bb.2:
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: jmp .LBB41_3
-; SSE-NEXT: .LBB41_1:
-; SSE-NEXT: movq %rax, %rcx
-; SSE-NEXT: shrq %rcx
-; SSE-NEXT: andl $1, %eax
-; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: cvtsi2ssq %rax, %xmm0
-; SSE-NEXT: addss %xmm0, %xmm0
-; SSE-NEXT: .LBB41_3:
-; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
-; SSE-NEXT: movq %xmm1, %rax
-; SSE-NEXT: testq %rax, %rax
-; SSE-NEXT: js .LBB41_4
-; SSE-NEXT: # %bb.5:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: jmp .LBB41_6
-; SSE-NEXT: .LBB41_4:
-; SSE-NEXT: movq %rax, %rcx
-; SSE-NEXT: shrq %rcx
-; SSE-NEXT: andl $1, %eax
-; SSE-NEXT: orq %rcx, %rax
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: addss %xmm1, %xmm1
-; SSE-NEXT: .LBB41_6:
-; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSE-NEXT: testq %rax, %rax
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: js .LBB41_8
-; SSE-NEXT: # %bb.7:
-; SSE-NEXT: xorps %xmm1, %xmm1
-; SSE-NEXT: cvtsi2ssq %rax, %xmm1
-; SSE-NEXT: .LBB41_8:
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_4i64_to_4f32_undef:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa %xmm0, %xmm1
+; SSE2-NEXT: movq %xmm0, %rax
+; SSE2-NEXT: testq %rax, %rax
+; SSE2-NEXT: js .LBB41_1
+; SSE2-NEXT: # %bb.2:
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT: jmp .LBB41_3
+; SSE2-NEXT: .LBB41_1:
+; SSE2-NEXT: movq %rax, %rcx
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: andl $1, %eax
+; SSE2-NEXT: orq %rcx, %rax
+; SSE2-NEXT: xorps %xmm0, %xmm0
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE2-NEXT: addss %xmm0, %xmm0
+; SSE2-NEXT: .LBB41_3:
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
+; SSE2-NEXT: movq %xmm1, %rax
+; SSE2-NEXT: testq %rax, %rax
+; SSE2-NEXT: js .LBB41_4
+; SSE2-NEXT: # %bb.5:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT: jmp .LBB41_6
+; SSE2-NEXT: .LBB41_4:
+; SSE2-NEXT: movq %rax, %rcx
+; SSE2-NEXT: shrq %rcx
+; SSE2-NEXT: andl $1, %eax
+; SSE2-NEXT: orq %rcx, %rax
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT: addss %xmm1, %xmm1
+; SSE2-NEXT: .LBB41_6:
+; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT: testq %rax, %rax
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: js .LBB41_8
+; SSE2-NEXT: # %bb.7:
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE2-NEXT: .LBB41_8:
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_4i64_to_4f32_undef:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pextrq $1, %xmm0, %rax
+; SSE41-NEXT: testq %rax, %rax
+; SSE41-NEXT: js .LBB41_1
+; SSE41-NEXT: # %bb.2:
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT: jmp .LBB41_3
+; SSE41-NEXT: .LBB41_1:
+; SSE41-NEXT: movq %rax, %rcx
+; SSE41-NEXT: shrq %rcx
+; SSE41-NEXT: andl $1, %eax
+; SSE41-NEXT: orq %rcx, %rax
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT: addss %xmm1, %xmm1
+; SSE41-NEXT: .LBB41_3:
+; SSE41-NEXT: movq %xmm0, %rax
+; SSE41-NEXT: testq %rax, %rax
+; SSE41-NEXT: js .LBB41_4
+; SSE41-NEXT: # %bb.5:
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT: jmp .LBB41_6
+; SSE41-NEXT: .LBB41_4:
+; SSE41-NEXT: movq %rax, %rcx
+; SSE41-NEXT: shrq %rcx
+; SSE41-NEXT: andl $1, %eax
+; SSE41-NEXT: orq %rcx, %rax
+; SSE41-NEXT: xorps %xmm0, %xmm0
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm0
+; SSE41-NEXT: addss %xmm0, %xmm0
+; SSE41-NEXT: .LBB41_6:
+; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE41-NEXT: testq %rax, %rax
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: js .LBB41_8
+; SSE41-NEXT: # %bb.7:
+; SSE41-NEXT: xorps %xmm1, %xmm1
+; SSE41-NEXT: cvtsi2ssq %rax, %xmm1
+; SSE41-NEXT: .LBB41_8:
+; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,0]
+; SSE41-NEXT: retq
 ;
 ; VEX-LABEL: uitofp_4i64_to_4f32_undef:
 ; VEX: # %bb.0:
@@ -1887,16 +2322,26 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 }

 define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
-; SSE-LABEL: uitofp_4i32_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; SSE-NEXT: pand %xmm0, %xmm1
-; SSE-NEXT: por {{.*}}(%rip), %xmm1
-; SSE-NEXT: psrld $16, %xmm0
-; SSE-NEXT: por {{.*}}(%rip), %xmm0
-; SSE-NEXT: addps {{.*}}(%rip), %xmm0
-; SSE-NEXT: addps %xmm1, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_4i32_to_4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535]
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: por {{.*}}(%rip), %xmm1
+; SSE2-NEXT: psrld $16, %xmm0
+; SSE2-NEXT: por {{.*}}(%rip), %xmm0
+; SSE2-NEXT: addps {{.*}}(%rip), %xmm0
+; SSE2-NEXT: addps %xmm1, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_4i32_to_4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200]
+; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE41-NEXT: psrld $16, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; SSE41-NEXT: addps {{.*}}(%rip), %xmm0
+; SSE41-NEXT: addps %xmm1, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: uitofp_4i32_to_4f32:
 ; AVX1: # %bb.0:
@@ -1949,12 +2394,18 @@ define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
 }

 define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
-; SSE-LABEL: uitofp_4i16_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_4i16_to_4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_4i16_to_4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: uitofp_4i16_to_4f32:
 ; AVX: # %bb.0:
@@ -1967,12 +2418,18 @@ define <4 x float> @uitofp_4i16_to_4f32(<8 x i16> %a) {
 }

 define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
-; SSE-LABEL: uitofp_8i16_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_8i16_to_4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_8i16_to_4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: uitofp_8i16_to_4f32:
 ; AVX1: # %bb.0:
@@ -2006,13 +2463,19 @@ define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
 }

 define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
-; SSE-LABEL: uitofp_4i8_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE-NEXT: cvtdq2ps %xmm0, %xmm0
-; SSE-NEXT: retq
+; SSE2-LABEL: uitofp_4i8_to_4f32:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE2-NEXT: retq
+;
+; SSE41-LABEL: uitofp_4i8_to_4f32:
+; SSE41: # %bb.0:
+; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0
+; SSE41-NEXT: retq
 ;
 ; AVX-LABEL: uitofp_4i8_to_4f32:
 ; AVX: # %bb.0:
@@ -2025,13 +2488,19 @@ define <4 x float> @uitofp_4i8_to_4f32(<16 x i8> %a) {
 }

 define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
-; SSE-LABEL: uitofp_16i8_to_4f32:
-; SSE: # %bb.0:
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-;
SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_16i8_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_16i8_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_16i8_to_4f32: ; AVX1: # %bb.0: @@ -2065,75 +2534,144 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) { } define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { -; SSE-LABEL: uitofp_4i64_to_4f32: -; SSE: # %bb.0: -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB47_1 -; SSE-NEXT: # %bb.2: -; SSE-NEXT: cvtsi2ssq %rax, %xmm2 -; SSE-NEXT: jmp .LBB47_3 -; SSE-NEXT: .LBB47_1: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm2 -; SSE-NEXT: addss %xmm2, %xmm2 -; SSE-NEXT: .LBB47_3: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB47_4 -; SSE-NEXT: # %bb.5: -; SSE-NEXT: cvtsi2ssq %rax, %xmm3 -; SSE-NEXT: jmp .LBB47_6 -; SSE-NEXT: .LBB47_4: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm3 -; SSE-NEXT: addss %xmm3, %xmm3 -; SSE-NEXT: .LBB47_6: -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB47_7 -; SSE-NEXT: # %bb.8: -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: jmp .LBB47_9 -; SSE-NEXT: .LBB47_7: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: addss %xmm1, %xmm1 -; SSE-NEXT: .LBB47_9: -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB47_10 -; SSE-NEXT: # %bb.11: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ssq %rax, %xmm0 -; SSE-NEXT: jmp .LBB47_12 -; SSE-NEXT: .LBB47_10: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ssq %rax, %xmm0 -; SSE-NEXT: addss %xmm0, %xmm0 -; SSE-NEXT: .LBB47_12: -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: movaps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_4i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB47_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: jmp .LBB47_3 +; SSE2-NEXT: .LBB47_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm2 +; SSE2-NEXT: .LBB47_3: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax 
+; SSE2-NEXT: js .LBB47_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE2-NEXT: jmp .LBB47_6 +; SSE2-NEXT: .LBB47_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE2-NEXT: addss %xmm3, %xmm3 +; SSE2-NEXT: .LBB47_6: +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB47_7 +; SSE2-NEXT: # %bb.8: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: jmp .LBB47_9 +; SSE2-NEXT: .LBB47_7: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB47_9: +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB47_10 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: jmp .LBB47_12 +; SSE2-NEXT: .LBB47_10: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB47_12: +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_4i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB47_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: jmp .LBB47_3 +; SSE41-NEXT: .LBB47_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: .LBB47_3: +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB47_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: jmp .LBB47_6 +; SSE41-NEXT: .LBB47_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: .LBB47_6: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB47_7 +; SSE41-NEXT: # %bb.8: +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: jmp .LBB47_9 +; SSE41-NEXT: .LBB47_7: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: .LBB47_9: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB47_10 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; SSE41-NEXT: .LBB47_10: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; 
SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_4i64_to_4f32: ; AVX1: # %bb.0: @@ -2319,26 +2857,44 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { } define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) { -; SSE-LABEL: uitofp_8i32_to_8f32: -; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] -; SSE-NEXT: addps %xmm6, %xmm0 -; SSE-NEXT: addps %xmm3, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: addps %xmm6, %xmm1 -; SSE-NEXT: addps %xmm2, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_8i32_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; SSE2-NEXT: addps %xmm6, %xmm0 +; SSE2-NEXT: addps %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: addps %xmm6, %xmm1 +; SSE2-NEXT: addps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i32_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] +; SSE41-NEXT: movaps {{.*#+}} xmm5 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; SSE41-NEXT: addps %xmm5, %xmm0 +; SSE41-NEXT: addps %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] +; SSE41-NEXT: addps %xmm5, %xmm1 +; SSE41-NEXT: addps %xmm2, %xmm1 +; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_8i32_to_8f32: ; AVX1: # %bb.0: @@ -2393,16 +2949,26 @@ define <8 x float> @uitofp_8i32_to_8f32(<8 x i32> %a) { } define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) { -; SSE-LABEL: uitofp_8i16_to_8f32: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: cvtdq2ps %xmm2, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_8i16_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i16_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_8i16_to_8f32: ; AVX1: # %bb.0: @@ -2429,17 +2995,27 @@ define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) { } define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) { -; SSE-LABEL: uitofp_8i8_to_8f32: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: cvtdq2ps %xmm2, %xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_8i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_8i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_8i8_to_8f32: ; AVX1: # %bb.0: @@ -2467,17 +3043,27 @@ define <8 x float> @uitofp_8i8_to_8f32(<16 x i8> %a) { } define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { -; SSE-LABEL: uitofp_16i8_to_8f32: -; SSE: # %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE-NEXT: cvtdq2ps %xmm2, 
%xmm2 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: cvtdq2ps %xmm0, %xmm1 -; SSE-NEXT: movaps %xmm2, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_16i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm2, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE2-NEXT: movaps %xmm2, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_16i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm2 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm1 +; SSE41-NEXT: movaps %xmm2, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_16i8_to_8f32: ; AVX1: # %bb.0: @@ -2510,17 +3096,28 @@ define <8 x float> @uitofp_16i8_to_8f32(<16 x i8> %a) { ; define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) { -; SSE-LABEL: sitofp_load_2i64_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: cvtsi2sdq %rax, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2sdq %rax, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: retq +; SSE2-LABEL: sitofp_load_2i64_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: cvtsi2sdq %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2sdq %rax, %xmm1 +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_2i64_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2sdq %rax, %xmm1 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2sdq %rax, %xmm0 +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE41-NEXT: retq ; ; VEX-LABEL: sitofp_load_2i64_to_2f64: ; VEX: # %bb.0: @@ -2585,13 +3182,20 @@ define <2 x double> @sitofp_load_2i32_to_2f64(<2 x i32> *%a) { } define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) { -; SSE-LABEL: sitofp_load_2i16_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: sitofp_load_2i16_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_2i16_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwq (%rdi), %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; 
SSE41-NEXT: retq ; ; AVX-LABEL: sitofp_load_2i16_to_2f64: ; AVX: # %bb.0: @@ -2605,15 +3209,22 @@ define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) { } define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) { -; SSE-LABEL: sitofp_load_2i8_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: movzwl (%rdi), %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE-NEXT: psrad $24, %xmm0 -; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: sitofp_load_2i8_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_2i8_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbq (%rdi), %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: sitofp_load_2i8_to_2f64: ; AVX: # %bb.0: @@ -2627,26 +3238,45 @@ define <2 x double> @sitofp_load_2i8_to_2f64(<2 x i8> *%a) { } define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) { -; SSE-LABEL: sitofp_load_4i64_to_4f64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: cvtsi2sdq %rax, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2sdq %rax, %xmm1 -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: movq %xmm2, %rax -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2sdq %rax, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE-NEXT: movq %xmm2, %rax -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: cvtsi2sdq %rax, %xmm2 -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: retq +; SSE2-LABEL: sitofp_load_4i64_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: cvtsi2sdq %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2sdq %rax, %xmm1 +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2sdq %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2sdq %rax, %xmm2 +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i64_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2sdq %rax, %xmm2 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2sdq %rax, %xmm0 +; SSE41-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2sdq %rax, %xmm2 +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2sdq %rax, %xmm1 +; SSE41-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE41-NEXT: retq ; ; AVX1-LABEL: sitofp_load_4i64_to_4f64: ; AVX1: # %bb.0: @@ -2751,15 +3381,23 @@ define <4 x double> @sitofp_load_4i32_to_4f64(<4 x 
i32> *%a) { } define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) { -; SSE-LABEL: sitofp_load_4i16_to_4f64: -; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: sitofp_load_4i16_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i16_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd (%rdi), %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq ; ; AVX-LABEL: sitofp_load_4i16_to_4f64: ; AVX: # %bb.0: @@ -2772,16 +3410,24 @@ define <4 x double> @sitofp_load_4i16_to_4f64(<4 x i16> *%a) { } define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) { -; SSE-LABEL: sitofp_load_4i8_to_4f64: -; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: psrad $24, %xmm1 -; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: sitofp_load_4i8_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i8_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd (%rdi), %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq ; ; AVX-LABEL: sitofp_load_4i8_to_4f64: ; AVX: # %bb.0: @@ -2798,24 +3444,37 @@ define <4 x double> @sitofp_load_4i8_to_4f64(<4 x i8> *%a) { ; define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) { -; SSE-LABEL: uitofp_load_2i64_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25] -; SSE-NEXT: subpd %xmm4, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: addpd %xmm1, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: subpd %xmm4, %xmm3 -; SSE-NEXT: movapd %xmm3, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] -; SSE-NEXT: addpd %xmm3, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_2i64_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa 
(%rdi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25] +; SSE2-NEXT: subpd %xmm4, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: subpd %xmm4, %xmm3 +; SSE2-NEXT: movapd %xmm3, %xmm1 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1] +; SSE2-NEXT: addpd %xmm3, %xmm1 +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_2i64_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1127219200,1160773632,0,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE41-NEXT: movapd {{.*#+}} xmm3 = [4503599627370496,1.9342813113834067E+25] +; SSE41-NEXT: subpd %xmm3, %xmm0 +; SSE41-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE41-NEXT: subpd %xmm3, %xmm2 +; SSE41-NEXT: haddpd %xmm2, %xmm0 +; SSE41-NEXT: retq ; ; VEX-LABEL: uitofp_load_2i64_to_2f64: ; VEX: # %bb.0: @@ -2868,17 +3527,29 @@ define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) { } define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) { -; SSE-LABEL: uitofp_load_2i32_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE-NEXT: mulpd {{.*}}(%rip), %xmm0 -; SSE-NEXT: addpd %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_2i32_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,65535,0,0,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_2i32_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: mulpd {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; VEX-LABEL: uitofp_load_2i32_to_2f64: ; VEX: # %bb.0: @@ -2923,13 +3594,20 @@ define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) { } define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) { -; SSE-LABEL: uitofp_load_2i16_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_2i16_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: 
uitofp_load_2i16_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: uitofp_load_2i16_to_2f64: ; AVX: # %bb.0: @@ -2943,15 +3621,22 @@ define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) { } define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) { -; SSE-LABEL: uitofp_load_2i8_to_2f64: -; SSE: # %bb.0: -; SSE-NEXT: movzwl (%rdi), %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_2i8_to_2f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movzwl (%rdi), %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_2i8_to_2f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: uitofp_load_2i8_to_2f64: ; AVX: # %bb.0: @@ -2965,37 +3650,57 @@ define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) { } define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { -; SSE-LABEL: uitofp_load_4i64_to_4f64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: movapd {{.*#+}} xmm5 = [4503599627370496,1.9342813113834067E+25] -; SSE-NEXT: subpd %xmm5, %xmm1 -; SSE-NEXT: movapd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] -; SSE-NEXT: addpd %xmm1, %xmm0 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: subpd %xmm5, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] -; SSE-NEXT: addpd %xmm4, %xmm1 -; SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: subpd %xmm5, %xmm2 -; SSE-NEXT: movapd %xmm2, %xmm1 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] -; SSE-NEXT: addpd %xmm2, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] -; SSE-NEXT: subpd %xmm5, %xmm4 -; SSE-NEXT: movapd %xmm4, %xmm2 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] -; SSE-NEXT: addpd %xmm4, %xmm2 -; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_4i64_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1127219200,1160773632,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; 
SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: movapd {{.*#+}} xmm5 = [4503599627370496,1.9342813113834067E+25] +; SSE2-NEXT: subpd %xmm5, %xmm1 +; SSE2-NEXT: movapd %xmm1, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1] +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: subpd %xmm5, %xmm4 +; SSE2-NEXT: movapd %xmm4, %xmm1 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1] +; SSE2-NEXT: addpd %xmm4, %xmm1 +; SSE2-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE2-NEXT: subpd %xmm5, %xmm2 +; SSE2-NEXT: movapd %xmm2, %xmm1 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1] +; SSE2-NEXT: addpd %xmm2, %xmm1 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: subpd %xmm5, %xmm4 +; SSE2-NEXT: movapd %xmm4, %xmm2 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1] +; SSE2-NEXT: addpd %xmm4, %xmm2 +; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i64_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE41-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE41-NEXT: movapd {{.*#+}} xmm4 = [4503599627370496,1.9342813113834067E+25] +; SSE41-NEXT: subpd %xmm4, %xmm0 +; SSE41-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE41-NEXT: subpd %xmm4, %xmm3 +; SSE41-NEXT: haddpd %xmm3, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE41-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE41-NEXT: subpd %xmm4, %xmm1 +; SSE41-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE41-NEXT: subpd %xmm4, %xmm3 +; SSE41-NEXT: haddpd %xmm3, %xmm1 +; SSE41-NEXT: retq ; ; VEX-LABEL: uitofp_load_4i64_to_4f64: ; VEX: # %bb.0: @@ -3069,26 +3774,47 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) { } define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) { -; SSE-LABEL: uitofp_load_4i32_to_4f64: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE-NEXT: movapd {{.*#+}} xmm2 = [65536,65536] -; SSE-NEXT: mulpd %xmm2, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE-NEXT: pand {{.*}}(%rip), %xmm0 -; SSE-NEXT: cvtdq2pd %xmm0, %xmm0 -; SSE-NEXT: addpd %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: cvtdq2pd %xmm1, %xmm4 -; SSE-NEXT: mulpd %xmm2, %xmm4 -; SSE-NEXT: pand {{.*}}(%rip), %xmm3 -; SSE-NEXT: cvtdq2pd %xmm3, %xmm1 -; SSE-NEXT: addpd %xmm4, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_4i32_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: movapd {{.*#+}} xmm2 = [65536,65536] +; SSE2-NEXT: mulpd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] +; SSE2-NEXT: pand {{.*}}(%rip), %xmm0 +; SSE2-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE2-NEXT: addpd %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm4 +; SSE2-NEXT: mulpd %xmm2, %xmm4 +; SSE2-NEXT: pand 
{{.*}}(%rip), %xmm3 +; SSE2-NEXT: cvtdq2pd %xmm3, %xmm1 +; SSE2-NEXT: addpd %xmm4, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i32_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: movapd {{.*#+}} xmm2 = [65536,65536] +; SSE41-NEXT: mulpd %xmm2, %xmm1 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] +; SSE41-NEXT: cvtdq2pd %xmm0, %xmm0 +; SSE41-NEXT: addpd %xmm1, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3,4,5,6,7] +; SSE41-NEXT: psrld $16, %xmm4 +; SSE41-NEXT: cvtdq2pd %xmm4, %xmm4 +; SSE41-NEXT: mulpd %xmm2, %xmm4 +; SSE41-NEXT: cvtdq2pd %xmm3, %xmm1 +; SSE41-NEXT: addpd %xmm4, %xmm1 +; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_load_4i32_to_4f64: ; AVX1: # %bb.0: @@ -3144,15 +3870,23 @@ define <4 x double> @uitofp_load_4i32_to_4f64(<4 x i32> *%a) { } define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) { -; SSE-LABEL: uitofp_load_4i16_to_4f64: -; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_4i16_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i16_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq ; ; AVX-LABEL: uitofp_load_4i16_to_4f64: ; AVX: # %bb.0: @@ -3165,16 +3899,24 @@ define <4 x double> @uitofp_load_4i16_to_4f64(<4 x i16> *%a) { } define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) { -; SSE-LABEL: uitofp_load_4i8_to_4f64: -; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: cvtdq2pd %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: cvtdq2pd %xmm1, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_4i8_to_4f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: 
uitofp_load_4i8_to_4f64: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE41-NEXT: cvtdq2pd %xmm1, %xmm1 +; SSE41-NEXT: retq ; ; AVX-LABEL: uitofp_load_4i8_to_4f64: ; AVX: # %bb.0: @@ -3191,27 +3933,47 @@ define <4 x double> @uitofp_load_4i8_to_4f64(<4 x i8> *%a) { ; define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) { -; SSE-LABEL: sitofp_load_4i64_to_4f32: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ssq %rax, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ssq %rax, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE-NEXT: retq +; SSE2-LABEL: sitofp_load_4i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq ; ; AVX1-LABEL: sitofp_load_4i64_to_4f32: ; AVX1: # %bb.0: @@ -3318,13 +4080,19 @@ define <4 x float> @sitofp_load_4i32_to_4f32(<4 x i32> *%a) { } define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) { -; SSE-LABEL: sitofp_load_4i16_to_4f32: -; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: sitofp_load_4i16_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps 
%xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i16_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: sitofp_load_4i16_to_4f32: ; AVX: # %bb.0: @@ -3337,14 +4105,20 @@ define <4 x float> @sitofp_load_4i16_to_4f32(<4 x i16> *%a) { } define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) { -; SSE-LABEL: sitofp_load_4i8_to_4f32: -; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE-NEXT: psrad $24, %xmm0 -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: sitofp_load_4i8_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_4i8_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: sitofp_load_4i8_to_4f32: ; AVX: # %bb.0: @@ -3357,46 +4131,83 @@ define <4 x float> @sitofp_load_4i8_to_4f32(<4 x i8> *%a) { } define <8 x float> @sitofp_load_8i64_to_8f32(<8 x i64> *%a) { -; SSE-LABEL: sitofp_load_8i64_to_8f32: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ssq %rax, %xmm0 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ssq %rax, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] -; SSE-NEXT: movq %xmm3, %rax -; SSE-NEXT: xorps %xmm4, %xmm4 -; SSE-NEXT: cvtsi2ssq %rax, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] -; SSE-NEXT: movq %xmm2, %rax -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE-NEXT: movq %xmm2, %rax -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: cvtsi2ssq %rax, %xmm2 -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] -; SSE-NEXT: retq +; SSE2-LABEL: sitofp_load_8i64_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm3 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: 
cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0] +; SSE2-NEXT: movq %xmm3, %rax +; SSE2-NEXT: xorps %xmm4, %xmm4 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm4[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_8i64_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa 32(%rdi), %xmm2 +; SSE41-NEXT: movdqa 48(%rdi), %xmm3 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0],xmm0[3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: pextrq $1, %xmm2, %rax +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[2,3] +; SSE41-NEXT: movq %xmm3, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; SSE41-NEXT: pextrq $1, %xmm3, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; SSE41-NEXT: retq ; ; AVX1-LABEL: sitofp_load_8i64_to_8f32: ; AVX1: # %bb.0: @@ -3553,17 +4364,25 @@ define <8 x float> @sitofp_load_8i32_to_8f32(<8 x i32> *%a) { } define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) { -; SSE-LABEL: sitofp_load_8i16_to_8f32: -; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: sitofp_load_8i16_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: 
cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_8i16_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxwd 8(%rdi), %xmm1 +; SSE41-NEXT: pmovsxwd (%rdi), %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: retq ; ; AVX1-LABEL: sitofp_load_8i16_to_8f32: ; AVX1: # %bb.0: @@ -3590,19 +4409,27 @@ define <8 x float> @sitofp_load_8i16_to_8f32(<8 x i16> *%a) { } define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) { -; SSE-LABEL: sitofp_load_8i8_to_8f32: -; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: psrad $24, %xmm1 -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE-NEXT: psrad $24, %xmm0 -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: sitofp_load_8i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $24, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSE2-NEXT: psrad $24, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: sitofp_load_8i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovsxbd 4(%rdi), %xmm1 +; SSE41-NEXT: pmovsxbd (%rdi), %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: retq ; ; AVX1-LABEL: sitofp_load_8i8_to_8f32: ; AVX1: # %bb.0: @@ -3635,76 +4462,147 @@ define <8 x float> @sitofp_load_8i8_to_8f32(<8 x i8> *%a) { ; define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { -; SSE-LABEL: uitofp_load_4i64_to_4f32: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm2 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB76_1 -; SSE-NEXT: # %bb.2: -; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: jmp .LBB76_3 -; SSE-NEXT: .LBB76_1: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: addss %xmm1, %xmm1 -; SSE-NEXT: .LBB76_3: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB76_4 -; SSE-NEXT: # %bb.5: -; SSE-NEXT: cvtsi2ssq %rax, %xmm3 -; SSE-NEXT: jmp .LBB76_6 -; SSE-NEXT: .LBB76_4: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm3 -; SSE-NEXT: addss %xmm3, %xmm3 -; SSE-NEXT: .LBB76_6: -; SSE-NEXT: movq %xmm2, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB76_7 -; SSE-NEXT: # %bb.8: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ssq %rax, %xmm0 -; SSE-NEXT: jmp .LBB76_9 -; SSE-NEXT: .LBB76_7: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ssq %rax, %xmm0 
-; SSE-NEXT: addss %xmm0, %xmm0 -; SSE-NEXT: .LBB76_9: -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE-NEXT: movq %xmm2, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB76_10 -; SSE-NEXT: # %bb.11: -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: cvtsi2ssq %rax, %xmm2 -; SSE-NEXT: jmp .LBB76_12 -; SSE-NEXT: .LBB76_10: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: cvtsi2ssq %rax, %xmm2 -; SSE-NEXT: addss %xmm2, %xmm2 -; SSE-NEXT: .LBB76_12: -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_4i64_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB76_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: jmp .LBB76_3 +; SSE2-NEXT: .LBB76_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB76_3: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB76_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE2-NEXT: jmp .LBB76_6 +; SSE2-NEXT: .LBB76_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE2-NEXT: addss %xmm3, %xmm3 +; SSE2-NEXT: .LBB76_6: +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB76_7 +; SSE2-NEXT: # %bb.8: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: jmp .LBB76_9 +; SSE2-NEXT: .LBB76_7: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB76_9: +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB76_10 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: jmp .LBB76_12 +; SSE2-NEXT: .LBB76_10: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm2 +; SSE2-NEXT: .LBB76_12: +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i64_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB76_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: jmp .LBB76_3 +; SSE41-NEXT: .LBB76_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: .LBB76_3: +; SSE41-NEXT: movq %xmm0, 
%rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB76_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: jmp .LBB76_6 +; SSE41-NEXT: .LBB76_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: .LBB76_6: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB76_7 +; SSE41-NEXT: # %bb.8: +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: jmp .LBB76_9 +; SSE41-NEXT: .LBB76_7: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: .LBB76_9: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB76_10 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq +; SSE41-NEXT: .LBB76_10: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_load_4i64_to_4f32: ; AVX1: # %bb.0: @@ -3894,17 +4792,28 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) { } define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) { -; SSE-LABEL: uitofp_load_4i32_to_4f32: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: por {{.*}}(%rip), %xmm1 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: por {{.*}}(%rip), %xmm0 -; SSE-NEXT: addps {{.*}}(%rip), %xmm0 -; SSE-NEXT: addps %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_4i32_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: por {{.*}}(%rip), %xmm1 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: por {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addps {{.*}}(%rip), %xmm0 +; SSE2-NEXT: addps %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i32_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7] +; SSE41-NEXT: addps {{.*}}(%rip), %xmm0 +; SSE41-NEXT: addps %xmm1, %xmm0 +; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_load_4i32_to_4f32: ; AVX1: # %bb.0: @@ -3960,13 +4869,19 @@ define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) { } define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) { -; SSE-LABEL: uitofp_load_4i16_to_4f32: -; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_4i16_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i16_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: uitofp_load_4i16_to_4f32: ; AVX: # %bb.0: @@ -3979,14 +4894,20 @@ define <4 x float> @uitofp_load_4i16_to_4f32(<4 x i16> *%a) { } define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) { -; SSE-LABEL: uitofp_load_4i8_to_4f32: -; SSE: # %bb.0: -; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_4i8_to_4f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_4i8_to_4f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: uitofp_load_4i8_to_4f32: ; AVX: # %bb.0: @@ -3999,143 +4920,279 @@ define <4 x float> @uitofp_load_4i8_to_4f32(<4 x i8> *%a) { } define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { -; SSE-LABEL: uitofp_load_8i64_to_8f32: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm0 -; SSE-NEXT: movdqa 32(%rdi), %xmm2 -; SSE-NEXT: movdqa 48(%rdi), %xmm1 -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB80_1 -; SSE-NEXT: # %bb.2: -; SSE-NEXT: cvtsi2ssq %rax, %xmm3 -; SSE-NEXT: jmp .LBB80_3 -; SSE-NEXT: .LBB80_1: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm3 -; SSE-NEXT: addss %xmm3, %xmm3 -; SSE-NEXT: .LBB80_3: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] -; SSE-NEXT: movq %xmm0, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB80_4 -; SSE-NEXT: # %bb.5: -; SSE-NEXT: cvtsi2ssq %rax, %xmm4 -; SSE-NEXT: jmp .LBB80_6 -; SSE-NEXT: .LBB80_4: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm4 -; SSE-NEXT: addss %xmm4, %xmm4 -; SSE-NEXT: .LBB80_6: -; SSE-NEXT: movq %xmm5, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB80_7 -; SSE-NEXT: # %bb.8: -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ssq %rax, %xmm0 -; SSE-NEXT: jmp .LBB80_9 -; SSE-NEXT: .LBB80_7: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; 
SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: xorps %xmm0, %xmm0 -; SSE-NEXT: cvtsi2ssq %rax, %xmm0 -; SSE-NEXT: addss %xmm0, %xmm0 -; SSE-NEXT: .LBB80_9: -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] -; SSE-NEXT: movq %xmm5, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB80_10 -; SSE-NEXT: # %bb.11: -; SSE-NEXT: cvtsi2ssq %rax, %xmm6 -; SSE-NEXT: jmp .LBB80_12 -; SSE-NEXT: .LBB80_10: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm6 -; SSE-NEXT: addss %xmm6, %xmm6 -; SSE-NEXT: .LBB80_12: -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB80_13 -; SSE-NEXT: # %bb.14: -; SSE-NEXT: xorps %xmm5, %xmm5 -; SSE-NEXT: cvtsi2ssq %rax, %xmm5 -; SSE-NEXT: jmp .LBB80_15 -; SSE-NEXT: .LBB80_13: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: xorps %xmm5, %xmm5 -; SSE-NEXT: cvtsi2ssq %rax, %xmm5 -; SSE-NEXT: addss %xmm5, %xmm5 -; SSE-NEXT: .LBB80_15: -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; SSE-NEXT: movq %xmm1, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB80_16 -; SSE-NEXT: # %bb.17: -; SSE-NEXT: cvtsi2ssq %rax, %xmm7 -; SSE-NEXT: jmp .LBB80_18 -; SSE-NEXT: .LBB80_16: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: cvtsi2ssq %rax, %xmm7 -; SSE-NEXT: addss %xmm7, %xmm7 -; SSE-NEXT: .LBB80_18: -; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE-NEXT: movq %xmm2, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB80_19 -; SSE-NEXT: # %bb.20: -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: jmp .LBB80_21 -; SSE-NEXT: .LBB80_19: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: xorps %xmm1, %xmm1 -; SSE-NEXT: cvtsi2ssq %rax, %xmm1 -; SSE-NEXT: addss %xmm1, %xmm1 -; SSE-NEXT: .LBB80_21: -; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] -; SSE-NEXT: movq %xmm2, %rax -; SSE-NEXT: testq %rax, %rax -; SSE-NEXT: js .LBB80_22 -; SSE-NEXT: # %bb.23: -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: cvtsi2ssq %rax, %xmm2 -; SSE-NEXT: jmp .LBB80_24 -; SSE-NEXT: .LBB80_22: -; SSE-NEXT: movq %rax, %rcx -; SSE-NEXT: shrq %rcx -; SSE-NEXT: andl $1, %eax -; SSE-NEXT: orq %rcx, %rax -; SSE-NEXT: xorps %xmm2, %xmm2 -; SSE-NEXT: cvtsi2ssq %rax, %xmm2 -; SSE-NEXT: addss %xmm2, %xmm2 -; SSE-NEXT: .LBB80_24: -; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_8i64_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm5 +; SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; SSE2-NEXT: movdqa 32(%rdi), %xmm2 +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: movq %xmm0, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_1 +; SSE2-NEXT: # %bb.2: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE2-NEXT: jmp .LBB80_3 +; SSE2-NEXT: .LBB80_1: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE2-NEXT: addss %xmm3, %xmm3 +; SSE2-NEXT: .LBB80_3: +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE2-NEXT: movq %xmm0, %rax +; 
SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_4 +; SSE2-NEXT: # %bb.5: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE2-NEXT: jmp .LBB80_6 +; SSE2-NEXT: .LBB80_4: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE2-NEXT: addss %xmm4, %xmm4 +; SSE2-NEXT: .LBB80_6: +; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_7 +; SSE2-NEXT: # %bb.8: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: jmp .LBB80_9 +; SSE2-NEXT: .LBB80_7: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE2-NEXT: addss %xmm0, %xmm0 +; SSE2-NEXT: .LBB80_9: +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1] +; SSE2-NEXT: movq %xmm5, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_10 +; SSE2-NEXT: # %bb.11: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm6 +; SSE2-NEXT: jmp .LBB80_12 +; SSE2-NEXT: .LBB80_10: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm6 +; SSE2-NEXT: addss %xmm6, %xmm6 +; SSE2-NEXT: .LBB80_12: +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_13 +; SSE2-NEXT: # %bb.14: +; SSE2-NEXT: xorps %xmm5, %xmm5 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm5 +; SSE2-NEXT: jmp .LBB80_15 +; SSE2-NEXT: .LBB80_13: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm5, %xmm5 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm5 +; SSE2-NEXT: addss %xmm5, %xmm5 +; SSE2-NEXT: .LBB80_15: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movq %xmm1, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_16 +; SSE2-NEXT: # %bb.17: +; SSE2-NEXT: cvtsi2ssq %rax, %xmm7 +; SSE2-NEXT: jmp .LBB80_18 +; SSE2-NEXT: .LBB80_16: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: cvtsi2ssq %rax, %xmm7 +; SSE2-NEXT: addss %xmm7, %xmm7 +; SSE2-NEXT: .LBB80_18: +; SSE2-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_19 +; SSE2-NEXT: # %bb.20: +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: jmp .LBB80_21 +; SSE2-NEXT: .LBB80_19: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE2-NEXT: addss %xmm1, %xmm1 +; SSE2-NEXT: .LBB80_21: +; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0] +; SSE2-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: testq %rax, %rax +; SSE2-NEXT: js .LBB80_22 +; SSE2-NEXT: # %bb.23: +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: jmp .LBB80_24 +; SSE2-NEXT: .LBB80_22: +; SSE2-NEXT: movq %rax, %rcx +; SSE2-NEXT: shrq %rcx +; SSE2-NEXT: andl $1, %eax +; SSE2-NEXT: orq %rcx, %rax +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE2-NEXT: addss %xmm2, %xmm2 +; SSE2-NEXT: .LBB80_24: +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = 
xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0] +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_8i64_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm4 +; SSE41-NEXT: movdqa 32(%rdi), %xmm1 +; SSE41-NEXT: movdqa 48(%rdi), %xmm2 +; SSE41-NEXT: pextrq $1, %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_1 +; SSE41-NEXT: # %bb.2: +; SSE41-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE41-NEXT: jmp .LBB80_3 +; SSE41-NEXT: .LBB80_1: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE41-NEXT: addss %xmm3, %xmm3 +; SSE41-NEXT: .LBB80_3: +; SSE41-NEXT: movq %xmm0, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_4 +; SSE41-NEXT: # %bb.5: +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: jmp .LBB80_6 +; SSE41-NEXT: .LBB80_4: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm0, %xmm0 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm0 +; SSE41-NEXT: addss %xmm0, %xmm0 +; SSE41-NEXT: .LBB80_6: +; SSE41-NEXT: movq %xmm4, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_7 +; SSE41-NEXT: # %bb.8: +; SSE41-NEXT: cvtsi2ssq %rax, %xmm5 +; SSE41-NEXT: jmp .LBB80_9 +; SSE41-NEXT: .LBB80_7: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm5 +; SSE41-NEXT: addss %xmm5, %xmm5 +; SSE41-NEXT: .LBB80_9: +; SSE41-NEXT: pextrq $1, %xmm4, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_10 +; SSE41-NEXT: # %bb.11: +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE41-NEXT: jmp .LBB80_12 +; SSE41-NEXT: .LBB80_10: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm4, %xmm4 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm4 +; SSE41-NEXT: addss %xmm4, %xmm4 +; SSE41-NEXT: .LBB80_12: +; SSE41-NEXT: pextrq $1, %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_13 +; SSE41-NEXT: # %bb.14: +; SSE41-NEXT: cvtsi2ssq %rax, %xmm6 +; SSE41-NEXT: jmp .LBB80_15 +; SSE41-NEXT: .LBB80_13: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: cvtsi2ssq %rax, %xmm6 +; SSE41-NEXT: addss %xmm6, %xmm6 +; SSE41-NEXT: .LBB80_15: +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] +; SSE41-NEXT: movq %xmm1, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_16 +; SSE41-NEXT: # %bb.17: +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: jmp .LBB80_18 +; SSE41-NEXT: .LBB80_16: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm1, %xmm1 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm1 +; SSE41-NEXT: addss %xmm1, %xmm1 +; SSE41-NEXT: .LBB80_18: +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[2,3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm5[0],xmm0[3] +; SSE41-NEXT: movq %xmm2, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_19 +; SSE41-NEXT: # %bb.20: +; SSE41-NEXT: xorps %xmm3, %xmm3 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE41-NEXT: jmp .LBB80_21 +; SSE41-NEXT: .LBB80_19: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; 
SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm3, %xmm3 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm3 +; SSE41-NEXT: addss %xmm3, %xmm3 +; SSE41-NEXT: .LBB80_21: +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; SSE41-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] +; SSE41-NEXT: pextrq $1, %xmm2, %rax +; SSE41-NEXT: testq %rax, %rax +; SSE41-NEXT: js .LBB80_22 +; SSE41-NEXT: # %bb.23: +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; SSE41-NEXT: retq +; SSE41-NEXT: .LBB80_22: +; SSE41-NEXT: movq %rax, %rcx +; SSE41-NEXT: shrq %rcx +; SSE41-NEXT: andl $1, %eax +; SSE41-NEXT: orq %rcx, %rax +; SSE41-NEXT: xorps %xmm2, %xmm2 +; SSE41-NEXT: cvtsi2ssq %rax, %xmm2 +; SSE41-NEXT: addss %xmm2, %xmm2 +; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_load_8i64_to_8f32: ; AVX1: # %bb.0: @@ -4468,28 +5525,48 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { } define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) { -; SSE-LABEL: uitofp_load_8i32_to_8f32: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm0 -; SSE-NEXT: movdqa 16(%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pand %xmm2, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] -; SSE-NEXT: por %xmm5, %xmm0 -; SSE-NEXT: movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] -; SSE-NEXT: addps %xmm6, %xmm0 -; SSE-NEXT: addps %xmm3, %xmm0 -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: addps %xmm6, %xmm1 -; SSE-NEXT: addps %xmm2, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_8i32_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa 16(%rdi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: pand %xmm2, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1258291200,1258291200,1258291200,1258291200] +; SSE2-NEXT: por %xmm4, %xmm3 +; SSE2-NEXT: psrld $16, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [1392508928,1392508928,1392508928,1392508928] +; SSE2-NEXT: por %xmm5, %xmm0 +; SSE2-NEXT: movaps {{.*#+}} xmm6 = [-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; SSE2-NEXT: addps %xmm6, %xmm0 +; SSE2-NEXT: addps %xmm3, %xmm0 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: por %xmm4, %xmm2 +; SSE2-NEXT: psrld $16, %xmm1 +; SSE2-NEXT: por %xmm5, %xmm1 +; SSE2-NEXT: addps %xmm6, %xmm1 +; SSE2-NEXT: addps %xmm2, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_8i32_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movdqa (%rdi), %xmm0 +; SSE41-NEXT: movdqa 16(%rdi), %xmm1 +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1258291200,1258291200,1258291200,1258291200] +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm0 +; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7] +; SSE41-NEXT: movaps {{.*#+}} xmm5 = 
[-5.49764202E+11,-5.49764202E+11,-5.49764202E+11,-5.49764202E+11] +; SSE41-NEXT: addps %xmm5, %xmm0 +; SSE41-NEXT: addps %xmm3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; SSE41-NEXT: psrld $16, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7] +; SSE41-NEXT: addps %xmm5, %xmm1 +; SSE41-NEXT: addps %xmm2, %xmm1 +; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_load_8i32_to_8f32: ; AVX1: # %bb.0: @@ -4547,16 +5624,24 @@ define <8 x float> @uitofp_load_8i32_to_8f32(<8 x i32> *%a) { } define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) { -; SSE-LABEL: uitofp_load_8i16_to_8f32: -; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_8i16_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa (%rdi), %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_8i16_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_load_8i16_to_8f32: ; AVX1: # %bb.0: @@ -4583,17 +5668,25 @@ define <8 x float> @uitofp_load_8i16_to_8f32(<8 x i16> *%a) { } define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) { -; SSE-LABEL: uitofp_load_8i8_to_8f32: -; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE-NEXT: retq +; SSE2-LABEL: uitofp_load_8i8_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: retq +; +; SSE41-LABEL: uitofp_load_8i8_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm1 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: retq ; ; AVX1-LABEL: uitofp_load_8i8_to_8f32: ; AVX1: # %bb.0: @@ -4625,19 +5718,32 @@ define <8 x float> @uitofp_load_8i8_to_8f32(<8 x i8> *%a) { %Arguments = type <{ <8 x i8>, <8 x i16>, <8 x float>* }> define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) { -; SSE-LABEL: aggregate_sitofp_8i16_to_8f32: -; SSE: # %bb.0: -; SSE-NEXT: movq 24(%rdi), %rax -; SSE-NEXT: movdqu 8(%rdi), %xmm0 -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: psrad $16, %xmm1 -; SSE-NEXT: cvtdq2ps %xmm1, %xmm1 -; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] -; SSE-NEXT: psrad $16, %xmm0 -; SSE-NEXT: cvtdq2ps %xmm0, %xmm0 -; SSE-NEXT: movaps %xmm0, 16(%rax) -; SSE-NEXT: movaps %xmm1, (%rax) -; SSE-NEXT: retq +; SSE2-LABEL: aggregate_sitofp_8i16_to_8f32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq 24(%rdi), %rax +; SSE2-NEXT: movdqu 8(%rdi), %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: psrad $16, %xmm1 +; SSE2-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] +; SSE2-NEXT: psrad $16, %xmm0 +; SSE2-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, 16(%rax) +; SSE2-NEXT: movaps %xmm1, (%rax) +; SSE2-NEXT: retq +; +; SSE41-LABEL: aggregate_sitofp_8i16_to_8f32: +; SSE41: # %bb.0: +; SSE41-NEXT: movq 24(%rdi), %rax +; SSE41-NEXT: movdqu 8(%rdi), %xmm0 +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE41-NEXT: pmovsxwd %xmm1, %xmm1 +; SSE41-NEXT: cvtdq2ps %xmm1, %xmm1 +; SSE41-NEXT: pmovsxwd %xmm0, %xmm0 +; SSE41-NEXT: cvtdq2ps %xmm0, %xmm0 +; SSE41-NEXT: movaps %xmm0, (%rax) +; SSE41-NEXT: movaps %xmm1, 16(%rax) +; SSE41-NEXT: retq ; ; AVX1-LABEL: aggregate_sitofp_8i16_to_8f32: ; AVX1: # %bb.0:
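; EDITOR NOTE: throughout these load tests the new SSE41 expectations replace
; the SSE2 widening idiom (punpcklbw/punpcklwd plus psrad for sign extension,
; or punpck* against a pxor'd zero register for zero extension) with single
; pmovsx*/pmovzx* instructions that can extend straight from memory. A minimal
; C sketch of the per-lane equivalence, with illustrative names and assuming
; arithmetic >> on negative values as on x86:
;
;   #include <stdint.h>
;   /* SSE2 path: shift the word into the high half, then shift back down. */
;   int32_t sext16_sse2(uint16_t w)  { return (int32_t)((uint32_t)w << 16) >> 16; }
;   /* SSE41 path: one pmovsxwd per vector does the cast directly. */
;   int32_t sext16_sse41(uint16_t w) { return (int16_t)w; }
;
; Either way the widened dwords feed cvtdq2ps/cvtdq2pd unchanged, which is why
; only the first one or two instructions of each SSE41 block differ.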
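; EDITOR NOTE: the uitofp_load_4i32_to_4f32 and uitofp_load_8i32_to_8f32
; expectations above encode the classic two-halves trick for u32 -> f32 when
; no unsigned convert exists: 1258291200 = 0x4B000000 is the bit pattern of
; 2^23, 1392508928 = 0x53000000 is the pattern of 2^39, and -5.49764202E+11
; is -(2^39 + 2^23). SSE41 merely swaps SSE2's pand/por masking for pblendw
; word blends. A minimal C sketch with illustrative names, not code from this
; patch:
;
;   #include <stdint.h>
;   #include <string.h>
;   float u32_to_f32(uint32_t x) {
;     uint32_t lo = 0x4B000000u | (x & 0xFFFFu); /* bits of 2^23 + lo16 */
;     uint32_t hi = 0x53000000u | (x >> 16);     /* bits of 2^39 + hi16 * 2^16 */
;     float flo, fhi;
;     memcpy(&flo, &lo, 4);                      /* type-pun via memcpy */
;     memcpy(&fhi, &hi, 4);
;     return (fhi - 549764202496.0f) + flo;      /* strip 2^39 + 2^23, combine */
;   }
;
; The single rounding happens in the final add, matching the addps pair in the
; checked assembly.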